# Spaces: vithacocf / ocr
# Running on Zero
# ocr
# File size: 5,871 Bytes

# =========================
# CAMEL-DOC-OCR (HF Spaces SAFE)
# Single-file – NO CUDA init at global scope
# =========================

import os
import gc
import torch
import fitz
import gradio as gr
import spaces
from PIL import Image

from transformers import AutoProcessor, BitsAndBytesConfig
from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration


# =========================
# CONFIG
# =========================
# Model repository id passed to the processor and model from_pretrained() calls.
MODEL_ID = "prithivMLmods/Camel-Doc-OCR-062825"
# Rasterization resolution for PDF pages; pdf_to_images() zooms by DPI / 72.
DPI = 150
# Upper bound (pixels) on the longest side of a rendered PDF page.
MAX_IMAGE_SIZE = 2048


# =========================
# TORCH FLAGS (SAFE FOR SPACES)
# =========================
# This file only runs generation, so autograd tracking is switched off globally.
torch.set_grad_enabled(False)
# Allow TF32 math for CUDA matmuls and cuDNN ops (trades a little precision
# for speed on GPUs that support it).
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True


# =========================
# LOAD MODEL (NO CUDA INIT HERE)
# =========================
# 4-bit NF4 quantization with double quantization; compute runs in float16.
bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Processor for MODEL_ID; used below for chat templating, input tensors,
# and (via its tokenizer) decoding.
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    trust_remote_code=True
)

# Quantized model, loaded once at import time and put in eval mode.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=bnb,
    device_map="auto",            # NOTE(review): presumably resolves to the Spaces-provided GPU — confirm
    torch_dtype=torch.float16,
    trust_remote_code=True
).eval()

# Reuse the EOS token as the padding token for the tokenizer.
processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id


# =========================
# PDF → IMAGE (FAST & SAFE)
# =========================
def pdf_to_images(pdf_bytes):
    """Render every page of an in-memory PDF to an RGB PIL image.

    Pages are rasterized at ``DPI``. Any page whose longest side exceeds
    ``MAX_IMAGE_SIZE`` pixels is shrunk in place, keeping its aspect ratio.

    Args:
        pdf_bytes: Raw bytes of a PDF document.

    Returns:
        A list of ``PIL.Image.Image`` objects, one per page, in page order.
    """
    # PDF user space is 72 units per inch, so DPI / 72 is the zoom factor.
    zoom = DPI / 72.0
    matrix = fitz.Matrix(zoom, zoom)

    images = []
    # The context manager closes the document even if rendering a page fails;
    # previously the document handle was never released.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            pix = page.get_pixmap(matrix=matrix, colorspace=fitz.csRGB)
            # frombytes copies the pixel data, so the image outlives the document.
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

            if max(img.size) > MAX_IMAGE_SIZE:
                img.thumbnail(
                    (MAX_IMAGE_SIZE, MAX_IMAGE_SIZE),
                    Image.Resampling.LANCZOS,
                )

            images.append(img)

    return images


# =========================
# OCR INFERENCE (CUDA ONLY HERE)
# =========================
@spaces.GPU
def run_inference(image, prompt, max_new_tokens):
    """Run the OCR model on one image and return the generated text.

    Args:
        image: A ``PIL.Image.Image``; converted to RGB if it is in another mode.
        prompt: Instruction text sent alongside the image.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The decoded model output for the newly generated tokens only (the
        prompt tokens are sliced off), with surrounding whitespace stripped.
    """
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Single-turn chat message carrying the image followed by the instruction.
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": prompt},
        ],
    }]

    text_prompt = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    inputs = processor(
        text=[text_prompt],
        images=[image],
        return_tensors="pt",
        # Marked REQUIRED by the original author.
        # NOTE(review): presumably truncation would cut image placeholder
        # tokens out of the prompt — confirm.
        truncation=False,
        padding="longest",  # marked REQUIRED by the original author
    ).to(model.device)

    # torch.autocast(device_type="cuda") replaces the deprecated
    # torch.cuda.amp.autocast with identical semantics.
    with torch.inference_mode(), torch.autocast(device_type="cuda", dtype=torch.float16):
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # greedy decoding
            use_cache=True,
            eos_token_id=processor.tokenizer.eos_token_id,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = outputs[:, prompt_len:]

    return processor.tokenizer.decode(
        new_tokens[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    ).strip()


# =========================
# FILE HANDLER
# =========================
def handle_file(file, prompt, max_new_tokens, progress=gr.Progress()):
    """Run OCR on an uploaded image or PDF and return the extracted text.

    Args:
        file: The upload from the Gradio file component. Either an object
            exposing the path as ``.name`` or a plain path string.
        prompt: Instruction text for the model; surrounding whitespace is
            stripped.
        max_new_tokens: Generation budget forwarded to ``run_inference``.
        progress: Gradio progress tracker, updated once per PDF page.

    Returns:
        For a PDF, the per-page results joined by a ``--- PAGE BREAK ---``
        separator (an empty string if the PDF has no pages). For any other
        file, the result for that single image.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if file is None:
        # Surface a readable message instead of an AttributeError traceback.
        raise gr.Error("Please upload an image or PDF first.")

    # Accept both an object with .name and a bare path string.
    file_path = getattr(file, "name", file)
    # splitext avoids treating an extension-less file named "pdf" as a PDF.
    ext = os.path.splitext(file_path)[1].lower()
    prompt = (prompt or "").strip()

    if ext != ".pdf":
        # Keep the file open only for the duration of the inference call.
        with Image.open(file_path) as img:
            return run_inference(img, prompt, max_new_tokens)

    with open(file_path, "rb") as f:
        images = pdf_to_images(f.read())

    total = len(images)
    results = []
    for page_no, img in enumerate(images, start=1):
        results.append(run_inference(img, prompt, max_new_tokens))
        progress(page_no / total, desc=f"Page {page_no}/{total}")

    return "\n\n--- PAGE BREAK ---\n\n".join(results)


# =========================
# DEFAULT PROMPT (CAMEL OCR)
# =========================
# Default instruction shown in the prompt box: asks the model for a JSON
# object with a fixed set of invoice fields and nothing else.
DEFAULT_PROMPT = (
    "You are an OCR + Information Extraction engine.\n"
    "Extract data strictly from the document.\n"
    "Return JSON ONLY. NO explanation.\n"
    "\n"
    "OUTPUT FORMAT:\n"
    "{\n"
    '  "price": "",\n'
    '  "vat": "",\n'
    '  "invoiceNo": "",\n'
    '  "invoiceDate": "",\n'
    '  "billingToTaxCode": "",\n'
    '  "accountingObjectTaxCode": "",\n'
    '  "description": ""\n'
    "}"
)


# =========================
# GRADIO UI
# =========================
with gr.Blocks(title="Camel-Doc-OCR") as demo:
    gr.Markdown("## 🧾 Camel-Doc-OCR (Qwen2.5-VL – 4bit, HF Spaces Safe)")

    with gr.Row():
        # Left column: upload, prompt, token budget and the trigger button.
        with gr.Column(scale=1):
            file_input = gr.File(
                file_types=[".jpg", ".jpeg", ".png", ".pdf"],
                label="Upload Image / PDF",
            )
            prompt_input = gr.Textbox(value=DEFAULT_PROMPT, label="Prompt", lines=10)
            max_tokens = gr.Radio(
                choices=[256, 512, 1024, 2048],
                value=512,
                label="Max new tokens",
            )
            run_btn = gr.Button(value="🚀 Run OCR", variant="primary")

        # Right column: OCR result.
        with gr.Column(scale=1):
            output = gr.Textbox(label="Result", lines=20)

    # Clicking the button sends (file, prompt, max tokens) to handle_file
    # and writes its return value into the result box.
    run_btn.click(
        handle_file,
        [file_input, prompt_input, max_tokens],
        output,
    )


# =========================
# CLEANUP
# =========================
def cleanup():
    """Free unreachable Python objects, then release cached CUDA memory.

    The garbage collector runs first so that memory held by tensors it frees
    is already unoccupied when the CUDA cache is emptied. In the opposite
    order that memory would stay in the cache until the next call.

    No caller is visible in this file; this is a standalone helper.
    """
    gc.collect()
    torch.cuda.empty_cache()


# =========================
# LAUNCH
# =========================
if __name__ == "__main__":
    # Listen on all interfaces at port 7860 and request a public share link.
    _launch_kwargs = dict(server_name="0.0.0.0", server_port=7860, share=True)
    demo.launch(**_launch_kwargs)