-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbuild_index.py
More file actions
95 lines (75 loc) · 2.58 KB
/
build_index.py
File metadata and controls
95 lines (75 loc) · 2.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
"""
Build embeddings index using Gemini API from data.txt
Run this once to create the index.pkl file.
"""
import os
import pickle
import time
from dotenv import load_dotenv
import google.generativeai as genai
load_dotenv()
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
def get_embedding(text: str) -> list:
    """Return the embedding vector for *text* from the Gemini API.

    Uses the text-embedding-004 model; the API response is a dict whose
    'embedding' entry holds the vector as a list of floats.
    """
    response = genai.embed_content(
        model="models/text-embedding-004",
        content=text,
    )
    return response['embedding']
def parse_data_file(file_path: str) -> list:
    """Parse document sections from a data.txt file into retrieval chunks.

    The file is expected to contain sections delimited by lines beginning
    with "PAGE:".  Sections of 100 characters or fewer are discarded as
    noise.  Longer sections are split into chunks of at most 1500
    characters so each embedding covers a retrieval-friendly span.
    (Previously any text past 1500 characters was silently truncated and
    lost; chunking preserves the full section content.)

    Args:
        file_path: Path to the UTF-8 encoded data file.

    Returns:
        A list of text chunks, each at most 1500 characters long.
    """
    CHUNK_SIZE = 1500        # target chunk length for better retrieval
    MIN_SECTION_LEN = 100    # sections at or below this length are skipped

    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    documents = []
    # Split by PAGE: sections
    for section in content.split('\nPAGE:'):
        section = section.strip()
        # Skip very short sections (blank splits, navigation fragments).
        if len(section) <= MIN_SECTION_LEN:
            continue
        # Chunk long sections instead of truncating so no text is lost.
        for start in range(0, len(section), CHUNK_SIZE):
            documents.append(section[start:start + CHUNK_SIZE])
    return documents
def build_index(documents: list, output_path: str):
    """Embed every document and pickle the resulting index to disk.

    Each document is embedded with get_embedding().  On failure the call
    is retried once after a short backoff; a document that fails twice is
    skipped so one bad item cannot abort the whole build.  The saved
    index is a dict with parallel "embeddings" and "texts" lists.

    Args:
        documents: Text chunks to embed.
        output_path: Destination path for the pickled index file.
    """
    print(f"Building embeddings for {len(documents)} documents...")
    embeddings = []
    texts = []
    for i, doc in enumerate(documents):
        print(f"Processing {i+1}/{len(documents)}...")
        # One initial attempt plus one retry after a backoff pause.
        for attempt in range(2):
            try:
                emb = get_embedding(doc)
            # Catch Exception (not bare except) so KeyboardInterrupt and
            # SystemExit still abort the build cleanly.
            except Exception as e:
                print(f"Error on doc {i}: {e}")
                time.sleep(2)
            else:
                embeddings.append(emb)
                texts.append(doc)
                break
        else:
            # Both attempts failed: skip this document and keep going.
            print(f"Skipping doc {i}")
            continue
        # Small delay every 10 docs to avoid API rate limits.
        if (i + 1) % 10 == 0:
            time.sleep(1)
    # Save index
    data = {
        "embeddings": embeddings,
        "texts": texts
    }
    with open(output_path, 'wb') as f:
        pickle.dump(data, f)
    print(f"Saved index with {len(embeddings)} documents to {output_path}")
if __name__ == '__main__':
    # Resolve paths relative to this script so it works from any CWD.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    data_path = os.path.join(script_dir, "data.txt")

    print(f"Reading from {data_path}...")
    documents = parse_data_file(data_path)
    print(f"Found {len(documents)} document sections")

    # Write index.pkl alongside the script.
    output_path = os.path.join(script_dir, "index.pkl")
    build_index(documents, output_path)