Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,5 @@
.env*
.env*

# Ignore virtual environment
venv/
env/
20 changes: 20 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# 1. Use an official, lightweight Python base image
FROM python:3.9-slim

# 2. Set a working directory inside the container
WORKDIR /app

# 3. Copy the requirements file into the container
#    (copied on its own, before the application code, so the dependency
#    layer below is reused from cache when only source files change)
COPY requirements.txt .

# 4. Install the Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# 5. Copy the rest of your application code into the container
COPY . .

# 6. Create an unprivileged user and switch to it so the server does not run
#    as root. --create-home gives the user a writable home directory, which
#    libraries that cache files under $HOME need.
RUN useradd --create-home --shell /usr/sbin/nologin appuser
USER appuser

# 7. Expose the port the app will run on
#    (8080 is above 1024, so the unprivileged user can bind it)
EXPOSE 8080

# 8. Define the command to start your Uvicorn server
CMD ["uvicorn", "scan:app", "--host", "0.0.0.0", "--port", "8080"]
Binary file added PiedraSantaSteakDinner.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
45 changes: 45 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Receipt Reader
This project is designed to take a receipt image and convert it into text tokens.

## Getting Started
Follow these instructions to set up the project on your local machine for development and testing.

## Prerequisites
- Python 3.9 or higher
- pip (Python package installer)

## Installation

Create and activate a virtual environment. Using a virtual environment is highly recommended, because it keeps this project's dependencies separate from the rest of your system.

On macOS and Linux:

### Create the environment
```
python3 -m venv venv
```

### Activate the environment
```
source venv/bin/activate
```

Your terminal prompt should now be prefixed with (venv), indicating that the environment is active.

### Install Dependencies

With your virtual environment active, install the required packages using the requirements.txt file:

```
pip install -r requirements.txt
```

This command will download and install all the necessary libraries for the project to run correctly.

## Usage
To run the main script, execute the following command from the root directory of the project:
```
python scan.py
```
Make sure your virtual environment is active when you run the script.

61 changes: 61 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
annotated-types==0.7.0
anyio==4.9.0
certifi==2025.6.15
charset-normalizer==3.4.2
click==8.1.8
dnspython==2.7.0
email_validator==2.2.0
exceptiongroup==1.3.0
fastapi==0.115.14
fastapi-cli==0.0.7
filelock==3.18.0
fsspec==2025.5.1
h11==0.16.0
hf-xet==1.1.5
httpcore==1.0.9
httptools==0.6.4
httpx==0.28.1
huggingface-hub==0.33.2
idna==3.10
itsdangerous==2.2.0
Jinja2==3.1.6
markdown-it-py==3.0.0
MarkupSafe==3.0.2
mdurl==0.1.2
mpmath==1.3.0
networkx==3.2.1
numpy==2.0.2
orjson==3.10.18
packaging==25.0
pillow==11.3.0
pydantic==2.11.7
pydantic-extra-types==2.10.5
pydantic-settings==2.10.1
pydantic_core==2.33.2
Pygments==2.19.2
python-dotenv==1.1.1
python-multipart==0.0.20
PyYAML==6.0.2
regex==2024.11.6
requests==2.32.4
rich==14.0.0
rich-toolkit==0.14.8
safetensors==0.5.3
sentencepiece==0.2.0
shellingham==1.5.4
sniffio==1.3.1
starlette==0.46.2
sympy==1.14.0
tokenizers==0.21.2
torch==2.7.1
tqdm==4.67.1
transformers==4.53.0
typer==0.16.0
typing-inspection==0.4.1
typing_extensions==4.14.0
ujson==5.10.0
urllib3==2.5.0
uvicorn==0.35.0
uvloop==0.21.0
watchfiles==1.1.0
websockets==15.0.1
73 changes: 45 additions & 28 deletions scan.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,58 @@
from fastapi import FastAPI, File, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from transformers import DonutProcessor, VisionEncoderDecoderModel
import torch
import re
from PIL import Image
import json
import io

# Initialize the app
app = FastAPI()

# CORS: the API is open to every origin, method and header. Credentialed
# requests are disabled because a wildcard origin must not be combined with
# allow_credentials=True; doing so would let any website make authenticated
# cross-origin calls on behalf of a visitor.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allows all origins
    allow_credentials=False,  # Must stay False while origins is a wildcard
    allow_methods=["*"],  # Allows all methods
    allow_headers=["*"],  # Allows all headers
)

# Load the processor and the model a single time, when the server starts.
# Both come from the same pretrained checkpoint, so the name is kept in one place.
DONUT_CHECKPOINT = "naver-clova-ix/donut-base-finetuned-cord-v2"
processor = DonutProcessor.from_pretrained(DONUT_CHECKPOINT, use_fast=False)
model = VisionEncoderDecoderModel.from_pretrained(DONUT_CHECKPOINT)

# Pick the inference device once and move the model onto it. This stays at
# module level because the request handler reads the module-level `device`.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


def _run_demo(image_path="DOLLARAMA.png"):
    """Parse one local receipt image and print the intermediate and final results.

    Prints, in order: the shape of the preprocessed pixel tensor, the decoded
    token sequence (with EOS/PAD tokens and the first tag removed), and the
    structured result from ``processor.token2json`` as indented JSON.

    Args:
        image_path: Path of the image file to parse.
    """
    # Close the file handle as soon as the RGB copy has been made.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")
    pixel_values = processor(image, return_tensors="pt").pixel_values
    print(pixel_values.shape)

    task_prompt = "<s_cord-v2>"
    decoder_input_ids = processor.tokenizer(
        task_prompt, add_special_tokens=False, return_tensors="pt"
    )["input_ids"]

    outputs = model.generate(
        pixel_values.to(device),
        decoder_input_ids=decoder_input_ids.to(device),
        max_length=model.decoder.config.max_position_embeddings,
        early_stopping=True,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
        output_scores=True,
    )

    sequence = processor.batch_decode(outputs.sequences)[0]
    sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(
        processor.tokenizer.pad_token, ""
    )
    sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
    print(sequence)

    print(json.dumps(processor.token2json(sequence), indent=4))


# Run the demo only when this file is executed directly. Importing the module
# (for example when an ASGI server loads `scan:app`) must not open a
# hard-coded image file or run inference as a side effect.
if __name__ == "__main__":
    _run_demo()
def _parse_receipt_image(image):
    """Run the Donut model on an RGB PIL image and return the parsed receipt.

    Args:
        image: The receipt image, already converted to RGB.

    Returns:
        The structure produced by ``processor.token2json`` for the decoded
        token sequence.
    """
    pixel_values = processor(image, return_tensors="pt").pixel_values
    task_prompt = "<s_cord-v2>"
    decoder_input_ids = processor.tokenizer(
        task_prompt, add_special_tokens=False, return_tensors="pt"
    )["input_ids"]

    outputs = model.generate(
        pixel_values.to(device),
        decoder_input_ids=decoder_input_ids.to(device),
        max_length=model.decoder.config.max_position_embeddings,
        early_stopping=True,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
        output_scores=True,
    )

    sequence = processor.batch_decode(outputs.sequences)[0]
    sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(
        processor.tokenizer.pad_token, ""
    )
    # Drop the first <...> tag, i.e. the task start token.
    sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()
    return processor.token2json(sequence)


@app.post("/process-receipt/")
async def process_receipt(file: UploadFile = File(...)):
    """Parse an uploaded receipt image and return its structured content.

    Args:
        file: The uploaded image file.

    Returns:
        The structured data produced by the Donut model for the receipt.

    Raises:
        HTTPException: With status 400 when the upload cannot be decoded as
            an image.
    """
    # Imported here so this handler does not depend on edits to the
    # module's import block.
    from fastapi import HTTPException
    from fastapi.concurrency import run_in_threadpool

    # 1. Read the image from the uploaded file
    image_data = await file.read()
    try:
        image = Image.open(io.BytesIO(image_data)).convert("RGB")
    except OSError as exc:
        # Pillow signals unreadable or truncated image data with OSError
        # (UnidentifiedImageError is a subclass); that is a client error.
        raise HTTPException(
            status_code=400,
            detail="Uploaded file is not a readable image.",
        ) from exc

    # 2. Process the image using the Donut model. Inference is blocking,
    #    CPU/GPU-bound work, so it runs in a worker thread to keep the event
    #    loop free for other requests.
    # 3. Return the structured JSON
    return await run_in_threadpool(_parse_receipt_image, image)