information-retrieval

Exploration of information retrieval topics
git clone git://git.laack.co/information-retrieval.git
Log | Files | Refs

index.py (1115B)


      1 import json
      2 from sentence_transformers import SentenceTransformer
      3 import os
      4 from pathlib import Path
      5 
      6 NAME = 'Qwen/Qwen3-Embedding-0.6B'
      7 # expand '~'
      8 PATH = Path(str(os.path.expanduser('~')) + '/models/' + NAME)
      9 EMBEDDING_PATH = 'embeddings.json'
     10 DOCUMENT_PATH = Path(os.path.expanduser('~/gitRepos/notes/docs'))
     11 
     12 model = None
     13 
     14 if not Path.exists(PATH):
     15     os.makedirs(PATH)
     16     model = SentenceTransformer(NAME)
     17     print('downloaded model')
     18     model.save(str(PATH))
     19     print('saved model')
     20 
     21 model = SentenceTransformer(str(PATH))
     22 print('loaded model')
     23 
     24 
     25 docs = os.listdir(DOCUMENT_PATH)
     26 print(f"Found {len(docs)} docs")
     27 abs_path = [str(DOCUMENT_PATH) + '/' + name for name in docs]
     28 
     29 
     30 # load this all in mem. might be large depending on corpus...
     31 documents = {}
     32 count = 1
     33 
     34 # we assume each document fits within context length for sentence embedding
     35 for path in abs_path:
     36     with open(path, 'r') as f:
     37         documents[path] = model.encode(f.read()).tolist()
     38         print(f"{count}: Indexed {path}")
     39         count += 1
     40 
     41 dumped = json.dumps(documents)
     42 
     43 with open(EMBEDDING_PATH, 'w') as f:
     44     f.write(dumped)