# =========================
# CAMEL-DOC-OCR (HF Spaces SAFE)
# Single-file – NO CUDA init at global scope
# =========================
import os
import gc
import torch
import fitz  # PyMuPDF
import gradio as gr
import spaces
from PIL import Image
from transformers import (
    AutoProcessor,
    BitsAndBytesConfig,
    Qwen2_5_VLForConditionalGeneration,
)
# =========================
# CONFIG
# =========================
MODEL_ID = "prithivMLmods/Camel-Doc-OCR-062825"
DPI = 150
MAX_IMAGE_SIZE = 2048
# =========================
# TORCH FLAGS (SAFE FOR SPACES)
# =========================
torch.set_grad_enabled(False)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
# =========================
# LOAD MODEL (NO CUDA INIT HERE)
# =========================
bnb = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
)
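# NF4 with double quantization (QLoRA-style) stores weights at roughly 4.1 bits
# per parameter while computing in float16, cutting VRAM to about a quarter of
# what a plain fp16 load would need.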
processor = AutoProcessor.from_pretrained(
MODEL_ID,
trust_remote_code=True
)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
MODEL_ID,
quantization_config=bnb,
device_map="auto", # HF Spaces will inject GPU here
torch_dtype=torch.float16,
trust_remote_code=True
).eval()
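# Ensure a pad token is defined so padding="longest" in the processor call works.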
processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id
# =========================
# PDF → IMAGE (FAST & SAFE)
# =========================
def pdf_to_images(pdf_bytes):
    """Render each PDF page to an RGB PIL image at DPI, capped at MAX_IMAGE_SIZE."""
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    images = []
    scale = DPI / 72.0  # PyMuPDF's base resolution is 72 DPI
    mat = fitz.Matrix(scale, scale)
    for page in doc:
        pix = page.get_pixmap(matrix=mat, colorspace=fitz.csRGB)
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        if max(img.size) > MAX_IMAGE_SIZE:
            img.thumbnail((MAX_IMAGE_SIZE, MAX_IMAGE_SIZE), Image.Resampling.LANCZOS)
        images.append(img)
    doc.close()  # release the document handle promptly
    return images
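# Example usage (hypothetical file name, for illustration only):
#   with open("invoice.pdf", "rb") as f:
#       pages = pdf_to_images(f.read())  # -> list[PIL.Image.Image], one per page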
# =========================
# OCR INFERENCE (CUDA ONLY HERE)
# =========================
@spaces.GPU
def run_inference(image, prompt, max_new_tokens):
if image.mode != "RGB":
image = image.convert("RGB")
messages = [{
"role": "user",
"content": [
{"type": "image", "image": image},
{"type": "text", "text": prompt}
]
}]
text_prompt = processor.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
inputs = processor(
text=[text_prompt],
images=[image],
return_tensors="pt",
        truncation=False,   # required: never truncate the multimodal prompt
        padding="longest"   # required: pad to the longest sequence in the batch
).to(model.device)
    with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16):
outputs = model.generate(
**inputs,
max_new_tokens=max_new_tokens,
do_sample=False,
use_cache=True,
eos_token_id=processor.tokenizer.eos_token_id
)
outputs = outputs[:, inputs["input_ids"].shape[1]:]
return processor.tokenizer.decode(
outputs[0],
skip_special_tokens=True,
clean_up_tokenization_spaces=True
).strip()
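# Example call (illustrative values; the @spaces.GPU decorator provisions the GPU):
#   text = run_inference(Image.open("page.png"), "Extract all text.", 512)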
# =========================
# FILE HANDLER
# =========================
def handle_file(file, prompt, max_new_tokens, progress=gr.Progress()):
    if file is None:
        return "Please upload an image or PDF first."
    # Gradio may pass a tempfile wrapper (with .name) or a plain path string.
    file_path = file.name if hasattr(file, "name") else file
    ext = file_path.lower().split(".")[-1]
prompt = prompt.strip()
if ext == "pdf":
with open(file_path, "rb") as f:
images = pdf_to_images(f.read())
results = []
for i, img in enumerate(images):
text = run_inference(img, prompt, max_new_tokens)
results.append(text)
progress((i + 1) / len(images), desc=f"Page {i+1}/{len(images)}")
return "\n\n--- PAGE BREAK ---\n\n".join(results)
else:
img = Image.open(file_path)
return run_inference(img, prompt, max_new_tokens)
# =========================
# DEFAULT PROMPT (CAMEL OCR)
# =========================
DEFAULT_PROMPT = """
You are an OCR + Information Extraction engine.
Extract data strictly from the document.
Return JSON ONLY. NO explanation.
OUTPUT FORMAT:
{
"price": "",
"vat": "",
"invoiceNo": "",
"invoiceDate": "",
"billingToTaxCode": "",
"accountingObjectTaxCode": "",
"description": ""
}
""".strip()
# =========================
# GRADIO UI
# =========================
with gr.Blocks(title="Camel-Doc-OCR") as demo:
gr.Markdown("## 🧾 Camel-Doc-OCR (Qwen2.5-VL – 4bit, HF Spaces Safe)")
with gr.Row():
with gr.Column(scale=1):
file_input = gr.File(
label="Upload Image / PDF",
file_types=[".jpg", ".jpeg", ".png", ".pdf"]
)
prompt_input = gr.Textbox(
label="Prompt",
value=DEFAULT_PROMPT,
lines=10
)
max_tokens = gr.Radio(
[256, 512, 1024, 2048],
value=512,
label="Max new tokens"
)
run_btn = gr.Button("🚀 Run OCR", variant="primary")
with gr.Column(scale=1):
output = gr.Textbox(
label="Result",
lines=20
)
run_btn.click(
fn=handle_file,
inputs=[file_input, prompt_input, max_tokens],
outputs=output
)
# =========================
# CLEANUP
# =========================
def cleanup():
    """Free cached GPU memory; call between requests if memory pressure builds."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
# =========================
# LAUNCH
# =========================
if __name__ == "__main__":
demo.launch(
server_name="0.0.0.0",
server_port=7860,
share=True
)