information-retrieval

Exploration of information retrieval topics
git clone git://git.laack.co/information-retrieval.git
Log | Files | Refs

process.py (851B)


      1 import pandas as pd
      2 import json
      3 from sentence_transformers import SentenceTransformer
      4 import os
      5 from pathlib import Path
      6 
      7 NAME = 'sentence-transformers/all-MiniLM-L6-v2'
      8 # expand '~'
      9 PATH = Path(str(os.path.expanduser('~')) + '/models/' + NAME)
     10 EMBEDDING_PATH = 'embeddings.json'
     11 DATASET_PATH = os.path.expanduser('~/datasets/ai_slop.csv')
     12 
     13 model = None
     14 
     15 if not Path.exists(PATH):
     16     os.makedirs(PATH)
     17     model = SentenceTransformer(NAME)
     18     print('downloaded model')
     19     model.save(str(PATH))
     20     print('saved model')
     21 
     22 model = SentenceTransformer(str(PATH))
     23 print('loaded model')
     24 
     25 def embed_doc(text):
     26     return model.encode(text)
     27 
     28 docs = pd.read_csv(DATASET_PATH)
     29 print(f"Loaded {len(docs)} records")
     30 
     31 docs['embedding'] = docs.iloc[:, 0].apply(embed_doc)
     32 
     33 docs = docs.drop('text', axis = 1)
     34 print(docs)
     35 docs.to_csv('embeddings.csv', index=False)