add LLM StreamingMode, change Editor, add ReloadButton

Junseo1026 · Junseo1026 · commit 75980097a245 · 2025-07-07T00:25:01.000+09:00
diff --git a/.gitignore b/.gitignore
@@ -4,4 +4,4 @@ nohup.out
 __pycache__/
 venv
 ./venv
-uploads/*
+uploads/*
diff --git a/Makefile b/Makefile
@@ -1,3 +1,3 @@
 # Makefile
 run:
-	uvicorn main:app --host 0.0.0.0 --port 8080 --reload
+	uvicorn main:app --host 0.0.0.0 --port 8080 --reload --http h11
diff --git a/routers/file.py b/routers/file.py
@@ -177,14 +177,18 @@ def download_file(
     if not os.path.exists(file_path):
         raise HTTPException(status_code=404, detail="서버에 파일이 존재하지 않습니다.")
 
-    # original_name 을 percent-encoding 해서 ASCII 만으로 헤더 구성
-    filename_quoted = quote(file_obj.original_name)
-    content_disposition = f"inline; filename*=UTF-8''{filename_quoted}"
-
+    # filename_star = file_obj.original_name
+    # return FileResponse(
+    #     path=file_path,
+    #     media_type=file_obj.content_type,
+    #     headers={"Content-Disposition": f"inline; filename*=UTF-8''{filename_star}"}
+    # )
+     # FastAPI가 내부에서 UTF-8로 인코딩된 Content-Disposition 헤더를 생성해 줌
     return FileResponse(
         path=file_path,
         media_type=file_obj.content_type,
-        headers={"Content-Disposition": content_disposition}
+        filename=file_obj.original_name,
+        background=None
     )
 
 
diff --git a/routers/note.py b/routers/note.py
@@ -1,14 +1,18 @@
 import os
 from dotenv import load_dotenv
-from fastapi import APIRouter, Depends, HTTPException
+from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks
+from fastapi.responses import StreamingResponse
 from sqlalchemy.orm import Session
 from typing import List
 from datetime import datetime
+import traceback
 
-from db import get_db
+from db import get_db, SessionLocal
 from models.note import Note
 from schemas.note import NoteCreate, NoteUpdate, NoteResponse, FavoriteUpdate
 from utils.jwt_utils import get_current_user
+from fastapi.responses import StreamingResponse 
+from utils.llm import stream_summary_with_langchain
 
 load_dotenv()
 HF_TOKEN = os.getenv("HF_API_TOKEN")
@@ -140,35 +144,39 @@ def toggle_favorite(
     db.refresh(note)
     return note
 
-
-# 8) 노트 요약 (LLM 호출)
-@router.post("/notes/{note_id}/summarize", response_model=NoteResponse)
-def summarize_note(
+def save_summary(note_id: int, text: str):
+    db2 = SessionLocal()
+    try:
+        tgt = db2.query(Note).filter(Note.id == note_id).first()
+        if tgt:
+            tgt.content = text
+            tgt.updated_at = datetime.utcnow()
+            db2.commit()
+    finally:
+        db2.close()
+
+@router.post("/notes/{note_id}/summarize")
+async def summarize_stream_langchain(
     note_id: int,
+    background_tasks: BackgroundTasks,
     db: Session = Depends(get_db),
     user = Depends(get_current_user)
 ):
-    note = db.query(Note).filter(
-        Note.id == note_id, Note.user_id == user.u_id
-    ).first()
-    if not note:
-        raise HTTPException(status_code=404, detail="Note not found")
-
-    original = note.content or ""
-    if not original.strip():
-        raise HTTPException(status_code=400, detail="내용이 비어 있어 요약할 수 없습니다.")
-
-    # ────────────────────────────────────────────────────────────────────
-    # 실제 요약 함수 호출 (지연 임포트)
-    try:
-        from utils.llm import summarize_with_qwen3
-        summary_text = summarize_with_qwen3(original)
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"요약 중 오류 발생: {e}")
-    # ────────────────────────────────────────────────────────────────────
-
-    note.content = summary_text
-    note.updated_at = datetime.utcnow()
-    db.commit()
-    db.refresh(note)
-    return note
+    note = db.query(Note).filter(Note.id == note_id, Note.user_id == user.u_id).first()
+    if not note or not (note.content or "").strip():
+        raise HTTPException(status_code=404, detail="요약 대상 없음")
+
+    async def event_gen():
+        parts = []
+        async for sse in stream_summary_with_langchain(note.content):
+            parts.append(sse.removeprefix("data: ").strip())
+            yield sse.encode()                    
+        full = "".join(parts).strip()
+        if full:
+            background_tasks.add_task(save_summary, note.id, full)
+
+    return StreamingResponse(
+        event_gen(),
+        media_type="text/event-stream",
+        headers={"Cache-Control": "no-cache"}
+    )
diff --git a/utils/llm.py b/utils/llm.py
@@ -1,97 +1,50 @@
-# ~/noteflow/Backend/utils/llm.py
+from langchain.callbacks import AsyncIteratorCallbackHandler
+from langchain_ollama import ChatOllama
+from langchain.schema import HumanMessage, SystemMessage
+import re, asyncio
 
-import torch
-from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
+_THOUGHT_PAT = re.compile(
+    r"^\s*(okay|let\s*me|i\s*need\s*to|first[, ]|then[, ]|next[, ]|in summary|먼저|그\s*다음|요약하면)",
+    re.I,
+)
 
-_MODEL_NAME = "Qwen/Qwen3-8B"
-
-# 전역 변수: 최초에는 토크나이저/모델이 None
-_tokenizer = None
-_model = None
-
-def _load_model():
-    """
-    summarize_with_qwen3()가 최초 호출될 때만 Qwen3-8B 모델과 토크나이저를 메모리에 로드합니다.
-    """
-    global _tokenizer, _model
-    if _model is None or _tokenizer is None:
-        # 1) Config 불러와서 parallel_style 지정
-        config = AutoConfig.from_pretrained(
-            _MODEL_NAME,
-            trust_remote_code=True
-        )
-        # 반드시 "auto"로 지정 (NoneType 오류 방지)
-        config.parallel_style = "auto"
-
-        # 2) 토크나이저 로드
-        _tokenizer = AutoTokenizer.from_pretrained(
-            _MODEL_NAME,
-            trust_remote_code=True
-        )
-
-        # 3) 모델 로드 시 config 인자 추가
-        _model = AutoModelForCausalLM.from_pretrained(
-            _MODEL_NAME,
-            config=config,            # custom config 전달
-            torch_dtype="auto",
-            device_map="auto",
-            trust_remote_code=True
-        )
-        _model.eval()
-
-
-def summarize_with_qwen3(
-    text: str,
-    max_new_tokens: int = 256,
-    temperature: float = 0.6
-) -> str:
+async def stream_summary_with_langchain(text: str):
     """
-    - 한국어 문서를 간결하고 핵심적으로 요약
-    - 반환값: 요약된 한국어 문자열
+    LangChain + Ollama에서 토큰을 비동기로 받아
+    SSE("data: ...\\n\\n") 형식으로 yield 하는 async generator
     """
-    # 모델/토크나이저가 아직 로드되지 않았다면, 이 시점에만 로드
-    if _model is None or _tokenizer is None:
-        _load_model()
+    # 1) LangChain용 콜백 핸들러
+    cb = AsyncIteratorCallbackHandler()
+
+    # 2) Ollama Chat 모델 (streaming=True)
+    llm = ChatOllama(
+        base_url="http://localhost:11434",
+        model="qwen3:8b",
+        streaming=True,
+        callbacks=[cb],
+        temperature=0.6,
+    )
 
-    # Chat-format prompt 생성
+    # 3) 프롬프트
     messages = [
-        {
-            "role": "system",
-            "content": (
-                "당신은 한국어 문서를 간결하고 핵심적으로 요약하는 전문가입니다. "
-                "요약 외에는 절대 다른 말을 하지 마세요."
-            )
-        },
-        {
-            "role": "user",
-            "content": text
-        }
+        SystemMessage(
+            content="다음 텍스트를 한국어로 간결하게 요약하세요. "
+                    "사고 과정(Chain‑of‑Thought)은 절대 출력하지 마세요./no_think"
+        ),
+        HumanMessage(content=text),
     ]
 
-    # tokenizer.apply_chat_template()를 통해 모델 친화적인 프롬프트 생성
-    prompt = _tokenizer.apply_chat_template(
-        messages,
-        tokenize=False,
-        add_generation_prompt=True,
-        enable_thinking=False
-    )
-
-    # 입력 토크나이즈 후 모델 디바이스로 이동
-    inputs = _tokenizer(prompt, return_tensors="pt").to(_model.device)
+    # 4) LLM 호출 비동기 실행
+    task = asyncio.create_task(llm.agenerate([messages]))
 
-    # 모델 generate 호출
-    outputs = _model.generate(
-        **inputs,
-        max_new_tokens=max_new_tokens,
-        temperature=temperature,
-        top_p=0.95,
-        top_k=20,
-        do_sample=False,              # 안정적인 요약을 위해 샘플링 끄기
-        eos_token_id=_tokenizer.eos_token_id
-    )
+    buffer = ""
+    async for token in cb.aiter():
+        buffer += token
+        if buffer.endswith(("\n", "。", ".", "…")):
+            line = buffer.strip()
+            buffer = ""
 
-    # 입력 프롬프트 뒤에 생성된 토큰만 디코딩
-    gen_tokens = outputs[0].tolist()[len(inputs.input_ids[0]):]
-    decoded = _tokenizer.decode(gen_tokens, skip_special_tokens=True)
+            if not _THOUGHT_PAT.match(line):
+                yield f"data: {line}\n\n"          # SSE 청크 전송
 
-    return decoded.strip()
+    await task  # 예외 전파