process_and_save.py
"""Embed every .txt file in data_input into a local Chroma vector store
using Ollama embeddings, deleting each file after it is processed."""
import os

from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings

# Define paths
data_input_folder = 'data_input'
vector_store_folder = 'vector_store'

# Ensure the vector_store folder exists
os.makedirs(vector_store_folder, exist_ok=True)


def process_and_save_to_vector_store(embedding_model="plutonioumguy/bge-m3"):
    # Initialize the embedding function and the persistent vector store
    embedding_function = OllamaEmbeddings(model=embedding_model)
    vectorstore = Chroma(
        persist_directory=vector_store_folder,
        embedding_function=embedding_function,
    )

    # Process all .txt files in data_input
    for filename in os.listdir(data_input_folder):
        if filename.endswith('.txt'):
            file_path = os.path.join(data_input_folder, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                text_content = file.read()

            # Split text into chunks for embedding (blank-line paragraphs);
            # a dedicated text splitter could bound chunk sizes more strictly
            chunks = [chunk for chunk in text_content.split('\n\n') if chunk.strip()]
            if chunks:
                # Add all non-empty chunks in one batched call
                vectorstore.add_texts(chunks)

            # Remove processed file
            os.remove(file_path)
            print(f"Processed and deleted {filename}")

    # No explicit persist() call: langchain_chroma's Chroma has no persist()
    # method, and writes are saved automatically when persist_directory is set.


if __name__ == "__main__":
    process_and_save_to_vector_store()
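
For reference, a minimal sketch of how the persisted store could be queried afterwards. This snippet is not part of process_and_save.py; the query string is a hypothetical example, and the store must be reopened with the same embedding model it was built with.

from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings

# Reopen the store written by process_and_save.py
embedding_function = OllamaEmbeddings(model="plutonioumguy/bge-m3")
vectorstore = Chroma(persist_directory='vector_store', embedding_function=embedding_function)

# Retrieve the 4 chunks most similar to a (hypothetical) query
results = vectorstore.similarity_search("example query", k=4)
for doc in results:
    print(doc.page_content)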