index.py (1115B)
"""Index a directory of notes as sentence embeddings.

Loads the embedding model named by NAME (downloading it and saving a copy
under PATH when PATH does not exist yet), encodes every regular file found
directly inside DOCUMENT_PATH, and writes a JSON object mapping each file's
path to its embedding (as a list) to EMBEDDING_PATH.
"""

import json
import os
from pathlib import Path

from sentence_transformers import SentenceTransformer

NAME = 'Qwen/Qwen3-Embedding-0.6B'
# Local copy of the model lives at ~/models/<NAME>.
PATH = Path(os.path.expanduser('~')) / 'models' / NAME
# Output file; a relative path, so it is resolved against the working directory.
EMBEDDING_PATH = 'embeddings.json'
DOCUMENT_PATH = Path(os.path.expanduser('~/gitRepos/notes/docs'))

if not PATH.exists():
    # Download first and only then create the directory: if the download
    # fails, no empty PATH is left behind to make the next run skip the
    # download and try to load a model from an empty directory.
    model = SentenceTransformer(NAME)
    print('downloaded model')
    PATH.mkdir(parents=True, exist_ok=True)
    model.save(str(PATH))
    print('saved model')

# Always load from the local copy, including right after saving it.
model = SentenceTransformer(str(PATH))
print('loaded model')


# Keep regular files only (open() on a subdirectory would raise), and sort
# the names so the indexing order and the JSON key order are reproducible.
docs = sorted(
    name for name in os.listdir(DOCUMENT_PATH)
    if (DOCUMENT_PATH / name).is_file()
)
print(f"Found {len(docs)} docs")
abs_path = [str(DOCUMENT_PATH / name) for name in docs]


# Every embedding is held in memory until the final write, so this may be
# large depending on the size of the corpus.
documents = {}

# Each document is encoded in a single call; we assume it fits within the
# model's context length for sentence embedding.
for count, path in enumerate(abs_path, start=1):
    # Explicit encoding so the text read does not depend on the platform's
    # locale default.
    with open(path, 'r', encoding='utf-8') as f:
        documents[path] = model.encode(f.read()).tolist()
    print(f"{count}: Indexed {path}")

dumped = json.dumps(documents)

with open(EMBEDDING_PATH, 'w', encoding='utf-8') as f:
    f.write(dumped)