process.py (851B)
"""Embed the first column of a CSV dataset with a locally cached model.

The script makes sure a copy of the SentenceTransformer model named by
``NAME`` is saved under ``PATH``, reads ``DATASET_PATH``, encodes the first
column of every record, swaps that column for an ``embedding`` column and
writes the result to ``embeddings.csv`` in the current working directory.
"""
import json
import os
from pathlib import Path

import pandas as pd
from sentence_transformers import SentenceTransformer

NAME = 'sentence-transformers/all-MiniLM-L6-v2'
# Local directory the model is saved to; expanduser resolves '~'.
PATH = Path(os.path.expanduser('~')) / 'models' / NAME
# NOTE(review): EMBEDDING_PATH is not used anywhere in this file; the output
# below is written to 'embeddings.csv'. Kept in case other code imports it.
EMBEDDING_PATH = 'embeddings.json'
DATASET_PATH = os.path.expanduser('~/datasets/ai_slop.csv')

model = None

# An existing but empty directory (e.g. left behind by an interrupted earlier
# run) is treated the same as a missing one, so the model is fetched again
# instead of failing to load from it.
if PATH.is_dir() and any(PATH.iterdir()):
    model = SentenceTransformer(str(PATH))
else:
    # Download before creating the directory: if the download raises, no
    # empty directory is left behind to fool the check above on the next run.
    model = SentenceTransformer(NAME)
    print('downloaded model')
    PATH.mkdir(parents=True, exist_ok=True)
    model.save(str(PATH))
    print('saved model')
    # The freshly downloaded instance is reused as-is; reloading it from
    # PATH would only repeat the load.
print('loaded model')


def embed_doc(text):
    """Return what the module-level ``model`` produces for *text*.

    *text* is forwarded unchanged to ``model.encode``, so it may be anything
    that method accepts (this script passes a list of the dataset's
    first-column values) and the return value is whatever ``encode`` returns.
    """
    return model.encode(text)


docs = pd.read_csv(DATASET_PATH)
print(f"Loaded {len(docs)} records")

# The text to embed is taken positionally (first column). Remember its name
# so the same column is the one removed afterwards, whatever it is called.
text_column = docs.columns[0]

# One batched encode call instead of one call per row. Iterating the result
# yields one embedding per record, in row order.
docs['embedding'] = list(embed_doc(docs.iloc[:, 0].tolist()))

docs = docs.drop(columns=text_column)
print(docs)
# NOTE(review): each embedding cell is presumably written in its NumPy string
# form -- confirm downstream readers can parse that.
docs.to_csv('embeddings.csv', index=False)