# upload.py
# Forked from AllAboutAI-YT/easy-local-rag
import os
import tkinter as tk
from tkinter import filedialog
import PyPDF2
import re
import json
# Shared helpers used by the PDF, text and JSON upload functions
def _split_into_chunks(text, max_chunk_size=1000):
    """Split text into whitespace-normalized, sentence-aligned chunks.

    Runs of whitespace (including newlines) are collapsed to single spaces,
    the text is split after sentence-ending punctuation (., ! or ?), and
    consecutive sentences are packed into chunks joined by a single space.

    Args:
        text: Raw text to split.
        max_chunk_size: Exclusive upper bound on the length of a chunk, in
            characters. A single sentence that does not fit is kept whole as
            its own chunk instead of being cut mid-sentence.

    Returns:
        A list of non-empty chunk strings. The list is empty when text
        contains no non-whitespace characters.
    """
    # Collapsing whitespace first guarantees a chunk never contains a newline,
    # which matters because the vault stores exactly one chunk per line.
    text = re.sub(r'\s+', ' ', text).strip()
    # Split on the spaces that follow sentence-ending punctuation
    sentences = re.split(r'(?<=[.!?]) +', text)

    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if not sentence:
            # re.split yields [''] for empty input
            continue
        # Keep the separating space between sentences inside a chunk
        candidate = f"{current_chunk} {sentence}" if current_chunk else sentence
        if len(candidate) < max_chunk_size:
            current_chunk = candidate
        else:
            # Never emit an empty chunk, even if the very first sentence is
            # already too long to fit on its own.
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = sentence
    if current_chunk:  # Don't forget the last chunk!
        chunks.append(current_chunk)
    return chunks


def _append_chunks_to_vault(chunks, vault_path):
    """Append each chunk to the vault file on its own line (UTF-8).

    Args:
        chunks: Iterable of chunk strings, none containing a newline.
        vault_path: Path of the vault file; it is created if missing.
    """
    with open(vault_path, "a", encoding="utf-8") as vault_file:
        # One newline terminates each chunk, i.e. one chunk per line
        vault_file.writelines(f"{chunk}\n" for chunk in chunks)


# Function to convert PDF to text and append to vault.txt
def convert_pdf_to_text(file_path=None, vault_path="vault.txt", max_chunk_size=1000):
    """Extract the text of a PDF and append it to the vault in chunks.

    Args:
        file_path: Path of the PDF to read. When None (the default, and what
            the GUI button uses) a file-open dialog asks the user for it.
        vault_path: File the chunks are appended to, one chunk per line.
        max_chunk_size: Exclusive upper bound on chunk length in characters;
            see _split_into_chunks.

    Returns:
        None. Nothing is read or written if the dialog is cancelled.
    """
    if file_path is None:
        file_path = filedialog.askopenfilename(filetypes=[("PDF Files", "*.pdf")])
    if not file_path:
        # The user cancelled the dialog
        return

    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extract each page exactly once; extraction is the expensive step
        page_texts = [page.extract_text() for page in pdf_reader.pages]

    # Pages that yield no text are skipped
    text = " ".join(page_text for page_text in page_texts if page_text)
    chunks = _split_into_chunks(text, max_chunk_size)
    _append_chunks_to_vault(chunks, vault_path)
    print("PDF content appended to vault.txt with each chunk on a separate line.")


# Function to upload a text file and append to vault.txt
def upload_txtfile(file_path=None, vault_path="vault.txt", max_chunk_size=1000):
    """Read a UTF-8 text file and append it to the vault in chunks.

    Args:
        file_path: Path of the text file to read. When None (the default,
            and what the GUI button uses) a file-open dialog asks for it.
        vault_path: File the chunks are appended to, one chunk per line.
        max_chunk_size: Exclusive upper bound on chunk length in characters;
            see _split_into_chunks.

    Returns:
        None. Nothing is read or written if the dialog is cancelled.
    """
    if file_path is None:
        file_path = filedialog.askopenfilename(filetypes=[("Text Files", "*.txt")])
    if not file_path:
        # The user cancelled the dialog
        return

    with open(file_path, 'r', encoding="utf-8") as txt_file:
        text = txt_file.read()

    chunks = _split_into_chunks(text, max_chunk_size)
    _append_chunks_to_vault(chunks, vault_path)
    print("Text file content appended to vault.txt with each chunk on a separate line.")


# Function to upload a JSON file and append to vault.txt
def upload_jsonfile(file_path=None, vault_path="vault.txt", max_chunk_size=1000):
    """Read a JSON file, flatten it to one string and append it in chunks.

    Args:
        file_path: Path of the JSON file to read. When None (the default,
            and what the GUI button uses) a file-open dialog asks for it.
        vault_path: File the chunks are appended to, one chunk per line.
        max_chunk_size: Exclusive upper bound on chunk length in characters;
            see _split_into_chunks.

    Returns:
        None. Nothing is read or written if the dialog is cancelled.

    Raises:
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    if file_path is None:
        file_path = filedialog.askopenfilename(filetypes=[("JSON Files", "*.json")])
    if not file_path:
        # The user cancelled the dialog
        return

    with open(file_path, 'r', encoding="utf-8") as json_file:
        data = json.load(json_file)

    # Flatten the JSON data into a single string, keeping non-ASCII text readable
    text = json.dumps(data, ensure_ascii=False)
    chunks = _split_into_chunks(text, max_chunk_size)
    _append_chunks_to_vault(chunks, vault_path)
    print("JSON file content appended to vault.txt with each chunk on a separate line.")
# Build the top-level Tk window that hosts the upload buttons
root = tk.Tk()
root.title("Upload .pdf, .txt, or .json")

# One button per supported file type; clicking a button runs its upload function
pdf_button = tk.Button(master=root, command=convert_pdf_to_text, text="Upload PDF")
txt_button = tk.Button(master=root, command=upload_txtfile, text="Upload Text File")
json_button = tk.Button(master=root, command=upload_jsonfile, text="Upload JSON File")

# Pack the buttons in creation order (PDF, text, JSON) with the same vertical padding
pdf_button.pack(pady=10)
txt_button.pack(pady=10)
json_button.pack(pady=10)

# Enter the Tk main event loop
root.mainloop()