import os
import math
import re
from glob import glob

import huggingface_hub
import numpy as np
import PyPDF2
import torch
from langchain import HuggingFacePipeline, LLMChain, PromptTemplate
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline

def load_model(model_id):
    print(f"Loading Model: {model_id}")
    tokenizer = AutoTokenizer.from_pretrained(
        model_id,
        cache_dir="./model/",
        use_auth_token=os.environ.get("HF_TOKEN"),  # read the token from the environment; never hardcode secrets
    )
    # model = AutoModelForCausalLM.from_pretrained(
    #     model_id,
    #     cache_dir="./model/",
    #     torch_dtype=torch.float16,
    #     trust_remote_code=True,
    #     load_in_8bit=True,
    #     use_auth_token=os.environ.get("HF_TOKEN"),
    # )
    # generation_config = GenerationConfig.from_pretrained(model_id)
    # # Create a pipeline for text generation
    # pipe = pipeline(
    #     "text-generation",
    #     model=model,
    #     tokenizer=tokenizer,
    #     max_length=4096,
    #     top_p=0.95,
    #     repetition_penalty=1.15,
    #     generation_config=generation_config,
    # )
    # local_llm = HuggingFacePipeline(pipeline=pipe, model_kwargs={"temperature": 0})
    # print("Local LLM Loaded")
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    # The pipeline construction above is commented out, so the tokenizer
    # currently stands in for the LLM slot in the return tuple.
    return tokenizer, tokenizer, embedding_model

def clean_document_numerics(text):
    # Strip digits, punctuation, and single-character tokens, then lowercase.
    return re.sub(r"(?:\b|(?<=\s))\w{1}\b|[\d\W]+", " ", text).lower()
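
# Illustrative check of the cleaner (output spacing is approximate; each run of
# replaced characters collapses to a single space):
# clean_document_numerics("Page 3: A result!")
# -> "page   result "  (the digits, punctuation, and one-letter token "A" all become spaces)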

def files_from_streamlit(files):
    full_string = ""
    for file in files:
        pdf_reader = PyPDF2.PdfReader(file)
        # Extract the text of every page.
        for page in pdf_reader.pages:
            full_string += page.extract_text()
    cleaned_string = clean_document_numerics(full_string)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2048,
        chunk_overlap=128,  # overlap keeps a similar span of tokens shared between adjacent chunks
        is_separator_regex=False,
    )
    chunks = text_splitter.split_text(cleaned_string)
    return chunks, cleaned_string
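
# Hedged usage sketch, assuming a Streamlit app (the `st` uploader below is an
# assumption, not part of this module):
# import streamlit as st
# uploaded = st.file_uploader("Upload PDFs", type="pdf", accept_multiple_files=True)
# if uploaded:
#     chunks, cleaned = files_from_streamlit(uploaded)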

# def load_documents_and_chunks(directory):
#     documents = []
#     for item_path in glob(directory + "*.pdf"):
#         loader = PyPDFLoader(item_path)
#         documents.extend(loader.load())
#     documents = [i.page_content for i in documents]
#     full_string = " ".join(documents)
#     cleaned_string = clean_document_numerics(full_string)
#     text_splitter = RecursiveCharacterTextSplitter(
#         chunk_size=2048,
#         chunk_overlap=128,  # overlap keeps a similar span of tokens shared between adjacent chunks
#         is_separator_regex=False,
#     )
#     chunks = text_splitter.split_text(cleaned_string)
#     return chunks, cleaned_string

def calc_tokens(text, tokenizer):
    return len(tokenizer.tokenize(text))

def cluster_sentences(sentences, embedding_model, distance_threshold=1.3):
    # Agglomerative (Ward) clustering over sentence embeddings; n_clusters=None
    # lets the distance threshold decide how many clusters emerge.
    sentence_embeddings = embedding_model.encode(sentences)
    clustering_model = AgglomerativeClustering(
        distance_threshold=distance_threshold, n_clusters=None, linkage="ward"
    )
    clustering_model.fit(sentence_embeddings)
    clustered_sentences = {}
    for sentence_id, cluster_id in enumerate(clustering_model.labels_):
        clustered_sentences.setdefault(cluster_id, []).append(sentences[sentence_id])
    return list(clustered_sentences.values())
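
# Illustrative call (the embedder and sentences are examples only): semantically
# close chunks should land in the same cluster.
# embedder = SentenceTransformer("all-MiniLM-L6-v2")
# groups = cluster_sentences(
#     ["the cat sat on the mat", "a cat was sitting on a mat", "stock prices fell today"],
#     embedder,
# )
# print(len(groups), [len(g) for g in groups])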

def redistribution(listoflistofchunks, tokenizer):
    limit = 1024
    redistributed = []
    for index, chunks in enumerate(listoflistofchunks):
        tokens = calc_tokens("\n".join(chunks), tokenizer)
        print(index, tokens)
        if tokens < limit:
            redistributed.append(chunks)
        else:
            print(
                f"chunk {index} of {tokens} tokens is split into {math.ceil(tokens / limit)} parts"
            )
            partitionlist = np.array_split(chunks, math.ceil(tokens / limit))
            redistributed.extend([list(x) for x in partitionlist])
    return redistributed
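
# Worked example of the splitting rule (numbers hypothetical): a cluster of 5
# chunks whose joined text measures ~2500 tokens is divided into
# math.ceil(2500 / 1024) == 3 parts by chunk count:
# np.array_split(["c1", "c2", "c3", "c4", "c5"], 3)
# -> [array(['c1', 'c2'], ...), array(['c3', 'c4'], ...), array(['c5'], ...)]
# Because the split is by chunk count rather than token count, an unusually
# large chunk can still push one part over the limit.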

def get_summary(files, LLM, tokenizer, embedding_model):
    chunks, cleaned_string = files_from_streamlit(files)
    clusters = cluster_sentences(chunks, embedding_model)
    limit_clusters = redistribution(clusters, tokenizer)
    tosendlangchain = ["\n".join(i) for i in limit_clusters]
    # For now this returns the token count of each cluster; the LLM
    # summarization call is not wired in yet, so `LLM` goes unused.
    res = []
    for i in tosendlangchain:
        res.append(calc_tokens(i, tokenizer))
    return res
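
# Hedged sketch of the summarization step that get_summary does not yet perform,
# using the PromptTemplate/LLMChain imports above (the prompt wording and the
# `LLM` pipeline object are assumptions, not part of this module):
# template = "Summarize the following text:\n\n{text}\n\nSummary:"
# prompt = PromptTemplate(template=template, input_variables=["text"])
# chain = LLMChain(llm=LLM, prompt=prompt)
# summaries = [chain.run(text=cluster_text) for cluster_text in tosendlangchain]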

# tokenizer, LLM, embedding_model = load_model("krthk/llama-2-7b-chat-finetuned")  # Hugging Face model id
# chunks, raw_text = load_documents_and_chunks("./docs/")
# clusters = cluster_sentences(chunks, embedding_model)
# limit_clusters = redistribution(clusters, tokenizer)
# tosendlangchain = ["\n".join(i) for i in limit_clusters]
# for i in tosendlangchain:
#     print(calc_tokens(i, tokenizer))