-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbuild_index.py
More file actions
95 lines (75 loc) · 2.58 KB
/
build_index.py
File metadata and controls
95 lines (75 loc) · 2.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
"""
Build embeddings index using Gemini API from data.txt
Run this once to create the index.pkl file.
"""
import os
import pickle
import time
from dotenv import load_dotenv
import google.generativeai as genai
load_dotenv()
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
def get_embedding(text: str) -> list:
    """Return the embedding vector for *text* from the Gemini API.

    Uses the text-embedding-004 model; the API response is a dict whose
    'embedding' entry holds the vector as a list of floats.
    """
    response = genai.embed_content(
        model="models/text-embedding-004",
        content=text,
    )
    return response['embedding']
def parse_data_file(file_path: str) -> list:
    """Parse document sections from a data.txt file into retrieval chunks.

    The file is expected to contain sections delimited by lines beginning
    with "PAGE:".  Sections of 100 characters or fewer are discarded as
    noise.  Longer sections are split into chunks of at most 1500
    characters so each embedding covers a retrieval-friendly span.
    (Previously any text past 1500 characters was silently truncated and
    lost; chunking preserves the full section content.)

    Args:
        file_path: Path to the UTF-8 encoded data file.

    Returns:
        A list of text chunks, each at most 1500 characters long.
    """
    CHUNK_SIZE = 1500        # target chunk length for better retrieval
    MIN_SECTION_LEN = 100    # sections at or below this length are skipped

    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    documents = []
    # Split by PAGE: sections
    for section in content.split('\nPAGE:'):
        section = section.strip()
        # Skip very short sections (blank splits, navigation fragments).
        if len(section) <= MIN_SECTION_LEN:
            continue
        # Chunk long sections instead of truncating so no text is lost.
        for start in range(0, len(section), CHUNK_SIZE):
            documents.append(section[start:start + CHUNK_SIZE])
    return documents
def build_index(documents: list, output_path: str):
    """Embed every document and pickle the resulting index to disk.

    Each document is embedded with get_embedding().  On failure the call
    is retried once after a short backoff; a document that fails twice is
    skipped so one bad item cannot abort the whole build.  The saved
    index is a dict with parallel "embeddings" and "texts" lists.

    Args:
        documents: Text chunks to embed.
        output_path: Destination path for the pickled index file.
    """
    print(f"Building embeddings for {len(documents)} documents...")
    embeddings = []
    texts = []
    for i, doc in enumerate(documents):
        print(f"Processing {i+1}/{len(documents)}...")
        # One initial attempt plus one retry after a backoff pause.
        for attempt in range(2):
            try:
                emb = get_embedding(doc)
            # Catch Exception (not bare except) so KeyboardInterrupt and
            # SystemExit still abort the build cleanly.
            except Exception as e:
                print(f"Error on doc {i}: {e}")
                time.sleep(2)
            else:
                embeddings.append(emb)
                texts.append(doc)
                break
        else:
            # Both attempts failed: skip this document and keep going.
            print(f"Skipping doc {i}")
            continue
        # Small delay every 10 docs to avoid API rate limits.
        if (i + 1) % 10 == 0:
            time.sleep(1)
    # Save index
    data = {
        "embeddings": embeddings,
        "texts": texts
    }
    with open(output_path, 'wb') as f:
        pickle.dump(data, f)
    print(f"Saved index with {len(embeddings)} documents to {output_path}")
if __name__ == '__main__':
    # Resolve paths relative to this script so it works from any CWD.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    data_path = os.path.join(script_dir, "data.txt")

    print(f"Reading from {data_path}...")
    documents = parse_data_file(data_path)
    print(f"Found {len(documents)} document sections")

    # Write index.pkl alongside the script.
    output_path = os.path.join(script_dir, "index.pkl")
    build_index(documents, output_path)