83 changes: 83 additions & 0 deletions app/services/summarizer.py
@@ -1,5 +1,48 @@
# Business logic / AI inference module

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import time
from typing import List

# 1. Load the model and configure 4-bit quantization
model_id = "nlpai-lab/KULLM3"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Some checkpoints ship without a pad token; fall back to EOS so batched padding works,
# and left-pad so decoder-only generation is not disturbed by trailing pad tokens.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    print(f"Initial VRAM usage: {torch.cuda.memory_allocated() / (1024**3):.2f} GB")

start_load_time = time.time()
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)
end_load_time = time.time()

print(f"\nModel loaded in {end_load_time - start_load_time:.2f} seconds")

if torch.cuda.is_available():
    initial_vram_after_load = torch.cuda.memory_allocated()
    peak_vram_after_load = torch.cuda.max_memory_allocated()
    print(f"VRAM allocated after model load: {initial_vram_after_load / (1024**3):.2f} GB")
    print(f"Peak VRAM used during model load: {peak_vram_after_load / (1024**3):.2f} GB")

# 2. LLM chat prompt format

def build_chat_prompt(prompt: str) -> str:
    return f"<s>[INST] {prompt.strip()} [/INST]"

# 3. Prompt-building functions (kept as before)
def build_transform_prompt(title: str, content: str, level: str) -> str:
    base = f"다음 뉴스 제목과 본문을 사용자의 이해 수준에 맞게 다시 써줘.\n\n뉴스 제목: {title}\n뉴스 본문: {content}\n"
    if level == "상":
@@ -12,3 +55,43 @@ def build_transform_prompt(title: str, content: str, level: str) -> str:

def build_summary_prompt(title: str, content: str) -> str:
    return f"다음 뉴스 제목과 본문을 한문장으로 간단히 요약해줘.\n\n뉴스 제목: {title}\n뉴스 본문: {content}"

# 4. Batch inference function
def kullm_batch_generate(prompts: List[str], max_new_tokens=512):
    chat_prompts = [build_chat_prompt(p) for p in prompts]
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
    inputs = tokenizer(chat_prompts, return_tensors="pt", padding=True).to(model.device)
    input_ids = inputs.input_ids
    attention_mask = inputs.attention_mask
    start_infer_time = time.time()
    output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.2,
        top_p=0.2,
        pad_token_id=tokenizer.eos_token_id
    )
    end_infer_time = time.time()
    generation_time = end_infer_time - start_infer_time
    decoded_results = []
    generated_tokens_list = []
    for i in range(len(prompts)):
        # Count only non-pad prompt tokens so padding is not attributed to the prompt length
        original_input_len = (input_ids[i] != tokenizer.pad_token_id).sum().item()
        generated_tokens = output[i].shape[0] - original_input_len
        generated_tokens_list.append(generated_tokens)
        result_text = tokenizer.decode(output[i], skip_special_tokens=True)
        decoded_results.append(result_text.split('[/INST]')[-1].strip())
    current_vram = 0
    peak_vram = 0
    if torch.cuda.is_available():
        current_vram = torch.cuda.memory_allocated()
        peak_vram = torch.cuda.max_memory_allocated()
    return decoded_results, generation_time, generated_tokens_list, current_vram, peak_vram

# 5. generate_content wrapper for a single prompt
def generate_content(prompt: str, max_new_tokens=512) -> str:
    results, _, _, _, _ = kullm_batch_generate([prompt], max_new_tokens=max_new_tokens)
    return results[0]
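For context, a minimal usage sketch of the new module (not part of this diff): it assumes the import path app.services.summarizer from the file header above, and the example articles and max_new_tokens value are illustrative only.

# Usage sketch (illustrative, not part of this PR)
from app.services.summarizer import (
    build_summary_prompt,
    build_transform_prompt,
    kullm_batch_generate,
    generate_content,
)

articles = [
    {"title": "뉴스 제목 1", "content": "뉴스 본문 1"},
    {"title": "뉴스 제목 2", "content": "뉴스 본문 2"},
]

# Batch path: one summary prompt per article, decoded in a single generate() call.
prompts = [build_summary_prompt(a["title"], a["content"]) for a in articles]
summaries, gen_time, token_counts, vram, peak_vram = kullm_batch_generate(prompts, max_new_tokens=128)
print(f"{len(summaries)} summaries in {gen_time:.2f}s, peak VRAM {peak_vram / (1024**3):.2f} GB")

# Single-prompt path: rewrite one article for the "상" comprehension level.
rewritten = generate_content(build_transform_prompt(articles[0]["title"], articles[0]["content"], "상"))
print(rewritten)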
11 changes: 11 additions & 0 deletions requirements.txt
@@ -0,0 +1,11 @@
fastapi
uvicorn==0.29.0        # ASGI server for running FastAPI
pydantic
python-dotenv
torch==2.7.1
transformers==4.51.3   # Core library for loading HF models and running inference
safetensors            # Fast, safe loading of the .safetensors checkpoint format used by HuggingFace models
accelerate>=0.20.3     # HuggingFace model acceleration and device management
huggingface-hub
sentencepiece          # Tokenizer backend for the model
bitsandbytes==0.42.0   # 4-bit quantization on GPU
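As a sanity check on this pinned stack, a short sketch (illustrative, not part of this PR; the file name is hypothetical) that confirms the environment can support 4-bit loading before the service imports the summarizer:

# environment_check.py — illustrative sketch, not part of this PR
import torch
import transformers
import bitsandbytes

print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
print("transformers:", transformers.__version__)
print("bitsandbytes:", bitsandbytes.__version__)

# bitsandbytes 4-bit quantization requires a CUDA GPU at runtime.
if not torch.cuda.is_available():
    raise SystemExit("A CUDA-capable GPU is required for 4-bit quantization with bitsandbytes.")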