Files changed:
- app.py (+74 -6)
- quantize_huihui_fara.py (+33 -6)
- quantize_qwen2_5_vl.py (+33 -6)
- tests/test_app.py (+21 -2)
app.py
CHANGED

@@ -358,19 +358,87 @@ def compress_and_upload(
     try:
         # Show sub-steps during model loading
         progress(0.05, desc="Stage 1/5: Determining model class...")
-   [3 removed lines not shown in this view]
+
+        # Determine the optimal device configuration based on available resources
+        if torch.cuda.is_available():
+            # If CUDA is available, use auto device mapping to distribute model across available devices
+            model = model_class.from_pretrained(
+                model_id,
+                torch_dtype=torch.float16 if torch.cuda.is_available() else "auto",
+                device_map="auto",
+                token=token,
+                trust_remote_code=True
+            )
+        else:
+            # If no CUDA, load on CPU
+            model = model_class.from_pretrained(
+                model_id,
+                torch_dtype="auto",
+                device_map="cpu",
+                token=token,
+                trust_remote_code=True
+            )
         progress(0.15, desc="Stage 1/5: Model loaded, loading tokenizer...")
     except ValueError as e:
         if "Unrecognized configuration class" in str(e):
             # If automatic detection fails, fall back to AutoModel and let transformers handle it
             print(f"Automatic model class detection failed, falling back to AutoModel: {e}")
             progress(0.05, desc="Stage 1/5: Using fallback model class...")
-   [3 removed lines not shown in this view]
+
+            if torch.cuda.is_available():
+                model = AutoModel.from_pretrained(
+                    model_id,
+                    torch_dtype=torch.float16 if torch.cuda.is_available() else "auto",
+                    device_map="auto",
+                    token=token,
+                    trust_remote_code=True
+                )
+            else:
+                model = AutoModel.from_pretrained(
+                    model_id,
+                    torch_dtype="auto",
+                    device_map="cpu",
+                    token=token,
+                    trust_remote_code=True
+                )
             progress(0.15, desc="Stage 1/5: Model loaded with fallback class...")
+        elif "offload_dir" in str(e):
+            # If the error mentions offload_dir, try with disk offloading
+            print(f"Model requires offloading, trying with temporary offload directory: {e}")
+            progress(0.05, desc="Stage 1/5: Setting up model with offloading...")
+
+            import tempfile
+            with tempfile.TemporaryDirectory() as temp_dir:
+                model = model_class.from_pretrained(
+                    model_id,
+                    torch_dtype=torch.float16 if torch.cuda.is_available() else "auto",
+                    device_map="auto",
+                    offload_folder=temp_dir,
+                    token=token,
+                    trust_remote_code=True
+                )
+            progress(0.15, desc="Stage 1/5: Model loaded with offloading...")
+        else:
+            raise
+    except RuntimeError as e:
+        if "out of memory" in str(e).lower() or "offload_dir" in str(e):
+            # If there's an out of memory error or offload_dir error, try memory-efficient loading
+            print(f"Memory issue detected, trying with CPU offloading: {e}")
+            progress(0.05, desc="Stage 1/5: Setting up memory-efficient model loading...")
+
+            # Use CPU offloading to handle memory constraints
+            import tempfile
+            with tempfile.TemporaryDirectory() as temp_dir:
+                model = model_class.from_pretrained(
+                    model_id,
+                    torch_dtype=torch.float16 if torch.cuda.is_available() else "auto",
+                    device_map="auto",
+                    offload_folder=temp_dir,
+                    max_memory={0: "24GB", "cpu": "48GB"},  # Limit GPU memory usage
+                    token=token,
+                    trust_remote_code=True
+                )
+            progress(0.15, desc="Stage 1/5: Model loaded with memory-efficient approach...")
         else:
             raise
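The device-aware loading logic above is now repeated for the primary model class, for the AutoModel fallback, and (below) in both quantization scripts. As an illustration only (the helper name and the default memory caps are not part of this change), the shared pattern could be factored into a single function along these lines:

# Illustrative sketch, not part of this PR: one helper for the repeated
# "try a fast GPU load, fall back to disk offload on memory errors" pattern.
import tempfile

import torch


def load_with_memory_fallback(model_class, model_id, token=None, max_memory=None):
    """Load `model_id` with `model_class`, retrying with disk offload on OOM."""
    if not torch.cuda.is_available():
        # CPU-only environment: let transformers pick the dtype, keep weights on the CPU.
        return model_class.from_pretrained(
            model_id, torch_dtype="auto", device_map="cpu",
            token=token, trust_remote_code=True,
        )
    try:
        # Preferred path: fp16 weights distributed across the available GPUs.
        return model_class.from_pretrained(
            model_id, torch_dtype=torch.float16, device_map="auto",
            token=token, trust_remote_code=True,
        )
    except (RuntimeError, ValueError) as e:
        if "out of memory" not in str(e).lower() and "offload_dir" not in str(e):
            raise
        # Retry with capped GPU/CPU budgets and spill the remainder to disk.
        offload_dir = tempfile.mkdtemp(prefix="hf-offload-")
        return model_class.from_pretrained(
            model_id, torch_dtype=torch.float16, device_map="auto",
            offload_folder=offload_dir, max_memory=max_memory or {0: "24GB", "cpu": "48GB"},
            token=token, trust_remote_code=True,
        )

app.py could then call load_with_memory_fallback(model_class, model_id, token=token) in both the primary and the AutoModel path, and the quantization scripts could reuse it without the token argument.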
quantize_huihui_fara.py
CHANGED

@@ -191,12 +191,39 @@ def quantize_huihui_fara_model(
         Quantized model
     """
     print(f"Loading model: {model_id}")
-   [6 removed lines not shown in this view]
+
+    # Handle different device scenarios properly
+    if torch.cuda.is_available():
+        try:
+            model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+                model_id,
+                torch_dtype=torch.float16,  # Use float16 to save memory
+                device_map="auto",  # Auto device mapping for memory efficiency
+                trust_remote_code=trust_remote_code
+            )
+        except RuntimeError as e:
+            if "out of memory" in str(e).lower() or "offload_dir" in str(e):
+                print(f"Memory issue detected, using offloading: {e}")
+                import tempfile
+                with tempfile.TemporaryDirectory() as temp_dir:
+                    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+                        model_id,
+                        torch_dtype=torch.float16,
+                        device_map="auto",
+                        offload_folder=temp_dir,
+                        max_memory={0: "24GB", "cpu": "48GB"},
+                        trust_remote_code=trust_remote_code
+                    )
+            else:
+                raise
+    else:
+        # CPU only
+        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model_id,
+            torch_dtype=torch.float32,  # Use float32 on CPU
+            device_map="cpu",
+            trust_remote_code=trust_remote_code
+        )

     print(f"Loading processor for: {model_id}")
     processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=trust_remote_code)
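After either branch it can be useful to confirm where the weights actually ended up. A quick check, assuming the model was loaded through the accelerate-backed path (i.e. a device_map was given):

# transformers records the final layer placement when device_map is used;
# printing it shows whether any layers were offloaded to CPU or to disk.
print(getattr(model, "hf_device_map", None))
# e.g. {'model.embed_tokens': 0, 'model.layers.0': 0, ..., 'lm_head': 'cpu'}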
quantize_qwen2_5_vl.py
CHANGED

@@ -189,12 +189,39 @@ def quantize_qwen2_5_vl_model(
         Quantized model
     """
     print(f"Loading model: {model_id}")
-   [6 removed lines not shown in this view]
+
+    # Handle different device scenarios properly
+    if torch.cuda.is_available():
+        try:
+            model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+                model_id,
+                torch_dtype=torch.float16,  # Use float16 to save memory on GPU
+                device_map="auto",  # Auto device mapping for memory efficiency
+                trust_remote_code=trust_remote_code
+            )
+        except RuntimeError as e:
+            if "out of memory" in str(e).lower() or "offload_dir" in str(e):
+                print(f"Memory issue detected, using offloading: {e}")
+                import tempfile
+                with tempfile.TemporaryDirectory() as temp_dir:
+                    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+                        model_id,
+                        torch_dtype=torch.float16,
+                        device_map="auto",
+                        offload_folder=temp_dir,
+                        max_memory={0: "24GB", "cpu": "48GB"},
+                        trust_remote_code=trust_remote_code
+                    )
+            else:
+                raise
+    else:
+        # CPU only
+        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model_id,
+            torch_dtype=torch.float32,  # Use float32 on CPU
+            device_map="cpu",
+            trust_remote_code=trust_remote_code
+        )

     print(f"Loading processor for: {model_id}")
     processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=trust_remote_code)
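For reference, the max_memory={0: "24GB", "cpu": "48GB"} argument used in both quantization scripts and in app.py is the accelerate-style placement budget consumed when device_map="auto": integer keys are CUDA device indices, the "cpu" key caps host RAM, and anything over both caps is placed in offload_folder. In short:

# Placement budget passed to from_pretrained(); the keys and caps below are the
# ones used in this change and should be tuned to the Space's actual hardware.
max_memory = {
    0: "24GB",      # CUDA device 0: keep at most ~24 GB of weights on the GPU
    "cpu": "48GB",  # then up to ~48 GB of weights in host RAM
}
# Layers that fit under neither cap are offloaded to offload_folder on disk.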
tests/test_app.py
CHANGED

@@ -111,8 +111,18 @@ def test_compress_and_upload_success(
     result = compress_and_upload(model_id, quant_method, model_type_selection, mock_gr_oauth_token)

     mock_whoami.assert_called_once_with(token="test_token")
+
+    # The device_map and torch_dtype should depend on CUDA availability
+    import torch
+    if torch.cuda.is_available():
+        expected_torch_dtype = torch.float16
+        expected_device_map = "auto"
+    else:
+        expected_torch_dtype = "auto"
+        expected_device_map = "cpu"
+
     mock_auto_model_for_causal_lm.from_pretrained.assert_called_once_with(
-        model_id, torch_dtype=…
+        model_id, torch_dtype=expected_torch_dtype, device_map=expected_device_map, token="test_token", trust_remote_code=True
     )
     mock_oneshot.assert_called_once()
     assert mock_oneshot.call_args[1]["model"] == mock_auto_model_for_causal_lm.from_pretrained.return_value

@@ -148,8 +158,17 @@ def test_compress_and_upload_with_trust_remote_code(
     model_type_selection = "Auto-detect (recommended)"
     compress_and_upload(model_id, quant_method, model_type_selection, mock_gr_oauth_token)

+    # The device_map and torch_dtype should depend on CUDA availability
+    import torch
+    if torch.cuda.is_available():
+        expected_torch_dtype = torch.float16
+        expected_device_map = "auto"
+    else:
+        expected_torch_dtype = "auto"
+        expected_device_map = "cpu"
+
     mock_auto_model_for_causal_lm.from_pretrained.assert_called_once_with(
-        model_id, torch_dtype=…
+        model_id, torch_dtype=expected_torch_dtype, device_map=expected_device_map, token="test_token", trust_remote_code=True
     )
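Because expected_torch_dtype and expected_device_map are derived from a runtime CUDA check, each CI run exercises only one branch of the new loading logic. A sketch of how the CPU branch could be pinned regardless of the runner's hardware, reusing the names from the tests above (not part of this change):

# Illustrative sketch only: force the CPU path by patching the CUDA check that
# app.py performs when loading the model, so the expected values are fixed.
from unittest.mock import patch

with patch("torch.cuda.is_available", return_value=False):
    compress_and_upload(model_id, quant_method, model_type_selection, mock_gr_oauth_token)

mock_auto_model_for_causal_lm.from_pretrained.assert_called_once_with(
    model_id, torch_dtype="auto", device_map="cpu",
    token="test_token", trust_remote_code=True,
)

A mirrored variant with return_value=True would assert torch_dtype=torch.float16 and device_map="auto" without needing a GPU runner, since from_pretrained is mocked.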