Skip to content

Commit 009bf53

Browse files
committed
0914 01:00
1 parent 7598009 commit 009bf53

File tree

6 files changed

+594
-103
lines changed

6 files changed

+594
-103
lines changed

requirements.txt

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,21 +3,31 @@ uvicorn
33
pydantic
44
sqlalchemy
55
mysql-connector-python
6-
dotenv
6+
python-dotenv
77
google-auth
88
requests
99
python-jose[cryptography]
1010
bcrypt
1111

12-
torch==2.3.0+cu121
13-
torchaudio==2.3.0+cu121
14-
torchvision==0.18.0+cu121
15-
--extra-index-url https://download.pytorch.org/whl/cu121
12+
# PyTorch (MacOS: CPU/MPS 빌드 자동 설치됨)
13+
torch==2.3.0
14+
torchvision==0.18.0
15+
torchaudio==2.3.0
1616

1717
transformers>=4.40.0
1818
accelerate
1919
sentencepiece
2020
protobuf
2121
python-multipart
2222
easyocr
23-
whisper
23+
whisper
24+
pytesseract
25+
pdf2image
26+
PyMuPDF
27+
python-docx
28+
29+
langchain>=0.2.0
30+
langchain-community
31+
langchain-core
32+
langchain-openai
33+
langchain-ollama

routers/file.py

Lines changed: 128 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -1,58 +1,29 @@
11
# ~/noteflow/Backend/routers/file.py
22

33
import os
4-
import io
5-
import whisper
6-
model = whisper.load_model("base")
74
from datetime import datetime
8-
import numpy as np
95
from typing import Optional, List
10-
from urllib.parse import quote
116

12-
from fastapi import APIRouter, Depends, UploadFile, File, Form, HTTPException, status
7+
from fastapi import APIRouter, Depends, UploadFile, File, Form, HTTPException, status, Query, Response
138
from fastapi.responses import FileResponse
149
from sqlalchemy.orm import Session
15-
from PIL import Image
1610

1711
from db import get_db
1812
from models.file import File as FileModel
1913
from models.note import Note as NoteModel
2014
from utils.jwt_utils import get_current_user
2115

22-
# -------------------------------
23-
# 1) EasyOCR 라이브러리 임포트 (GPU 모드 활성화)
24-
# -------------------------------
25-
import easyocr
26-
reader = easyocr.Reader(["ko", "en"], gpu=True)
27-
28-
# -------------------------------
29-
# 2) Hugging Face TrOCR 모델용 파이프라인 (GPU 사용)
30-
# -------------------------------
31-
from transformers import pipeline
32-
33-
hf_trocr_printed = pipeline(
34-
"image-to-text",
35-
model="microsoft/trocr-base-printed",
36-
device=0,
37-
trust_remote_code=True
38-
)
39-
hf_trocr_handwritten = pipeline(
40-
"image-to-text",
41-
model="microsoft/trocr-base-handwritten",
42-
device=0,
43-
trust_remote_code=True
44-
)
45-
hf_trocr_small_printed = pipeline(
46-
"image-to-text",
47-
model="microsoft/trocr-small-printed",
48-
device=0,
49-
trust_remote_code=True
50-
)
51-
hf_trocr_large_printed = pipeline(
52-
"image-to-text",
53-
model="microsoft/trocr-large-printed",
54-
device=0,
55-
trust_remote_code=True
16+
# 추가/변경: 공통 OCR 파이프라인(thin wrapper)
17+
from utils.ocr import run_pipeline, detect_type
18+
from schemas.file import OCRResponse
19+
20+
# Allowed upload extensions, grouped by document family. The OCR endpoint
# answers a mismatch with HTTP 200 plus a warning rather than an error status.
ALLOWED_IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp"}
ALLOWED_PDF_EXTS = {".pdf"}
ALLOWED_DOC_EXTS = {".doc", ".docx"}
ALLOWED_HWP_EXTS = {".hwp"}
# Union of every family above; used for the single membership check.
ALLOWED_ALL_EXTS = ALLOWED_IMAGE_EXTS.union(
    ALLOWED_PDF_EXTS,
    ALLOWED_DOC_EXTS,
    ALLOWED_HWP_EXTS,
)
5728

5829
# 업로드 디렉토리 설정
@@ -65,6 +36,38 @@
6536

6637
router = APIRouter(prefix="/api/v1/files", tags=["Files"])
6738

39+
@router.get("/ocr/diag", summary="OCR 런타임 의존성 진단")
def ocr_dependency_diag():
    """Report which external OCR tools are available on this host.

    Looks up tesseract, poppler (``pdftoppm`` or ``pdftocairo``), LibreOffice
    (``soffice`` or ``libreoffice``) and ``hwp5txt`` on ``PATH``. When
    tesseract is found, its version banner and installed language packs are
    queried as well.

    Returns:
        A dict with boolean flags ``tesseract``, ``poppler``, ``libreoffice``
        and ``hwp5txt``, plus ``tesseract_version`` (first line of
        ``tesseract --version`` output, an ``ERR: ...`` string if the call
        failed, or ``None``) and ``tesseract_langs`` (list of language codes,
        or ``None`` when unavailable).
    """
    import shutil
    import subprocess

    def which(cmd: str) -> bool:
        """Return True when *cmd* resolves to an executable on PATH."""
        return shutil.which(cmd) is not None

    def run(cmd: list[str]) -> str:
        """Run *cmd* and return its combined output, or ``ERR: ...`` on failure."""
        # Deliberately best-effort: a diagnostics endpoint should report the
        # failure instead of raising, so every error becomes a string.
        try:
            out = subprocess.check_output(cmd, stderr=subprocess.STDOUT, timeout=5)
            return out.decode(errors="ignore").strip()
        except Exception as e:
            return f"ERR: {e}"

    tesseract_ok = which("tesseract")
    poppler_ok = which("pdftoppm") or which("pdftocairo")
    soffice_ok = which("soffice") or which("libreoffice")
    hwp5txt_ok = which("hwp5txt")

    langs = None
    tess_ver = None
    if tesseract_ok:
        # Guard against empty output: indexing [0] on an empty list would raise.
        version_lines = run(["tesseract", "--version"]).splitlines()
        tess_ver = version_lines[0] if version_lines else None

        langs_out = run(["tesseract", "--list-langs"])
        if langs_out and not langs_out.startswith("ERR:"):
            # Skip the "List of available languages" header and blank lines.
            langs = [
                line.strip()
                for line in langs_out.splitlines()
                if line.strip() and not line.lower().startswith("list of available")
            ]

    return {
        "tesseract": tesseract_ok,
        "tesseract_version": tess_ver,
        "tesseract_langs": langs,
        "poppler": poppler_ok,
        "libreoffice": soffice_ok,
        "hwp5txt": hwp5txt_ok,
    }
6871

6972
@router.post(
7073
"/upload",
@@ -194,73 +197,94 @@ def download_file(
194197

195198
@router.post(
    "/ocr",
    summary="이미지/PDF/DOC/DOCX/HWP OCR → 텍스트 변환 후 노트 생성",
    response_model=OCRResponse
)
async def ocr_and_create_note(
    # The upload is accepted under 'file' (default) or the legacy 'ocr_file' name.
    file: Optional[UploadFile] = File(None, description="기본 업로드 필드명"),
    ocr_file: Optional[UploadFile] = File(None, description="과거 호환 업로드 필드명"),
    folder_id: Optional[int] = Form(None),
    langs: str = Query("kor+eng", description="Tesseract 언어코드(예: kor+eng)"),
    max_pages: int = Query(50, ge=1, le=500, description="최대 처리 페이지 수(기본 50)"),
    db: Session = Depends(get_db),
    current_user = Depends(get_current_user)
):
    """
    Extract text from an uploaded image/PDF/DOC/DOCX/HWP file and save it as a note.

    The upload is handed to the shared pipeline (utils.ocr.run_pipeline).
    The per-item texts in its "results" are joined with blank lines; when the
    joined text is non-empty a new note titled "OCR 결과" is created for the
    current user.

    - A disallowed extension or an unknown file type returns HTTP 200 with
      results=[] and the reason in warnings.
    - A missing upload raises HTTP 400.
    - A failure while saving the note is rolled back and reported in
      warnings; note_id stays None.

    Returns the pipeline output with "note_id" and "text" added.
    """
    # Pick whichever upload field the client used.
    upload = file or ocr_file
    if upload is None:
        raise HTTPException(status_code=400, detail="업로드 파일이 필요합니다. 필드명은 'file' 또는 'ocr_file'을 사용하세요.")

    filename = upload.filename or "uploaded"
    mime = upload.content_type

    # Extension allow-list: a mismatch is answered with 200 + warnings.
    # A filename without any extension falls through to detect_type below.
    _, ext = os.path.splitext(filename)
    ext = ext.lower()
    if ext and ext not in ALLOWED_ALL_EXTS:
        return OCRResponse(
            filename=filename,
            mime=mime,
            page_count=0,
            results=[],
            warnings=[f"허용되지 않는 확장자({ext}). 허용: {sorted(ALLOWED_ALL_EXTS)}"],
            note_id=None,
            text=None,
        )

    # Second line of defence: reject anything the pipeline cannot classify.
    ftype = detect_type(filename, mime)
    if ftype == "unknown":
        return OCRResponse(
            filename=filename,
            mime=mime,
            page_count=0,
            results=[],
            warnings=["지원되지 않는 파일 형식입니다."],
            note_id=None,
            text=None,
        )

    data = await upload.read()

    pipe = run_pipeline(
        filename=filename,
        mime=mime,
        data=data,
        langs=langs,
        max_pages=max_pages,
    )

    # Join every non-empty result text, separated by a blank line.
    merged_text = "\n\n".join(
        item.get("text", "") for item in (pipe.get("results") or []) if item.get("text")
    ).strip()

    note_id: Optional[int] = None
    if merged_text:
        try:
            new_note = NoteModel(
                user_id=current_user.u_id,
                folder_id=folder_id,
                title="OCR 결과",
                content=merged_text,
            )
            db.add(new_note)
            db.commit()
            db.refresh(new_note)
            note_id = new_note.id
        except Exception as e:
            # Roll back so the failed transaction does not leave the session
            # unusable; the OCR result is still returned with a warning.
            db.rollback()
            pipe.setdefault("warnings", []).append(f"노트 저장 실패: {e}")

    pipe["note_id"] = note_id
    pipe["text"] = merged_text or None

    return pipe
264288

265289

266290
@router.post("/audio")
@@ -338,3 +362,10 @@ async def upload_audio_and_transcribe(
338362
"message": "STT 및 노트 저장 완료",
339363
"transcript": transcript
340364
}
365+
@router.options("/ocr")
def ocr_cors_preflight() -> Response:
    """Answer CORS preflight (OPTIONS) requests for the OCR endpoint.

    An explicit OPTIONS route that always replies with an empty 200, added so
    that proxies/clients which would otherwise receive a 405 get a success
    response instead of relying on middleware alone.
    """
    preflight_ok = Response(status_code=200)
    return preflight_ok

schemas/file.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
from __future__ import annotations
2+
3+
from typing import List, Optional
4+
from pydantic import BaseModel, Field
5+
6+
7+
class OCRResultItem(BaseModel):
    """One entry of OCR output: a page number and the text found on it."""

    # Page index of this entry (numbering base is set by the pipeline — TODO confirm).
    page: int
    # Text extracted for that page.
    text: str
10+
11+
12+
class OCRResponse(BaseModel):
    """Response body of the OCR endpoint.

    Combines the metadata of the shared OCR pipeline with the two fields
    (``note_id``, ``text``) that the earlier image-only OCR response exposed.
    """

    # --- Shared-pipeline metadata ---
    filename: str
    mime: Optional[str] = None
    page_count: int
    # Per-page results; defaults to an empty list.
    results: List[OCRResultItem] = Field(default_factory=list)
    # Human-readable reasons for skipped or failed processing; empty by default.
    warnings: List[str] = Field(default_factory=list)

    # --- Backward-compatible fields from the previous image OCR response ---
    note_id: Optional[int] = None
    text: Optional[str] = None
23+

utils/ocr/__init__.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
"""
2+
utils.ocr 패키지
3+
4+
추가/변경 요약
5+
- 공통 OCR 파이프라인 진입점(run_pipeline)을 외부에 노출
6+
- 이미지/PDF/DOC/DOCX/HWP를 단일 인터페이스로 처리
7+
"""
8+
9+
from .ocr_core import run_pipeline, detect_type
10+
11+
__all__ = [
12+
"run_pipeline",
13+
"detect_type",
14+
]
15+

0 commit comments

Comments
 (0)