n00b001 committed
Commit c2bdc87 · unverified · 1 Parent(s): f58eadc
Files changed (4)
  1. app.py +74 -6
  2. quantize_huihui_fara.py +33 -6
  3. quantize_qwen2_5_vl.py +33 -6
  4. tests/test_app.py +21 -2
app.py CHANGED
@@ -358,19 +358,87 @@ def compress_and_upload(
     try:
         # Show sub-steps during model loading
         progress(0.05, desc="Stage 1/5: Determining model class...")
-        model = model_class.from_pretrained(
-            model_id, torch_dtype="auto", device_map=None, token=token, trust_remote_code=True
-        )
+
+        # Determine the optimal device configuration based on available resources
+        if torch.cuda.is_available():
+            # If CUDA is available, use auto device mapping to distribute model across available devices
+            model = model_class.from_pretrained(
+                model_id,
+                torch_dtype=torch.float16 if torch.cuda.is_available() else "auto",
+                device_map="auto",
+                token=token,
+                trust_remote_code=True
+            )
+        else:
+            # If no CUDA, load on CPU
+            model = model_class.from_pretrained(
+                model_id,
+                torch_dtype="auto",
+                device_map="cpu",
+                token=token,
+                trust_remote_code=True
+            )
         progress(0.15, desc="Stage 1/5: Model loaded, loading tokenizer...")
     except ValueError as e:
         if "Unrecognized configuration class" in str(e):
             # If automatic detection fails, fall back to AutoModel and let transformers handle it
             print(f"Automatic model class detection failed, falling back to AutoModel: {e}")
             progress(0.05, desc="Stage 1/5: Using fallback model class...")
-            model = AutoModel.from_pretrained(
-                model_id, torch_dtype="auto", device_map=None, token=token, trust_remote_code=True
-            )
+
+            if torch.cuda.is_available():
+                model = AutoModel.from_pretrained(
+                    model_id,
+                    torch_dtype=torch.float16 if torch.cuda.is_available() else "auto",
+                    device_map="auto",
+                    token=token,
+                    trust_remote_code=True
+                )
+            else:
+                model = AutoModel.from_pretrained(
+                    model_id,
+                    torch_dtype="auto",
+                    device_map="cpu",
+                    token=token,
+                    trust_remote_code=True
+                )
             progress(0.15, desc="Stage 1/5: Model loaded with fallback class...")
+        elif "offload_dir" in str(e):
+            # If the error mentions offload_dir, try with disk offloading
+            print(f"Model requires offloading, trying with temporary offload directory: {e}")
+            progress(0.05, desc="Stage 1/5: Setting up model with offloading...")
+
+            import tempfile
+            with tempfile.TemporaryDirectory() as temp_dir:
+                model = model_class.from_pretrained(
+                    model_id,
+                    torch_dtype=torch.float16 if torch.cuda.is_available() else "auto",
+                    device_map="auto",
+                    offload_folder=temp_dir,
+                    token=token,
+                    trust_remote_code=True
+                )
+            progress(0.15, desc="Stage 1/5: Model loaded with offloading...")
+        else:
+            raise
+    except RuntimeError as e:
+        if "out of memory" in str(e).lower() or "offload_dir" in str(e):
+            # If there's an out of memory error or offload_dir error, try memory-efficient loading
+            print(f"Memory issue detected, trying with CPU offloading: {e}")
+            progress(0.05, desc="Stage 1/5: Setting up memory-efficient model loading...")
+
+            # Use CPU offloading to handle memory constraints
+            import tempfile
+            with tempfile.TemporaryDirectory() as temp_dir:
+                model = model_class.from_pretrained(
+                    model_id,
+                    torch_dtype=torch.float16 if torch.cuda.is_available() else "auto",
+                    device_map="auto",
+                    offload_folder=temp_dir,
+                    max_memory={0: "24GB", "cpu": "48GB"},  # Limit GPU memory usage
+                    token=token,
+                    trust_remote_code=True
+                )
+            progress(0.15, desc="Stage 1/5: Model loaded with memory-efficient approach...")
         else:
             raise
 
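The GPU/CPU/offload decision above now appears in three places inside compress_and_upload. Below is a minimal sketch of the same pattern factored into one helper; the name load_model_with_fallbacks and the 24GB/48GB memory limits are illustrative assumptions (the limits are simply carried over from the diff), and only standard torch/transformers from_pretrained arguments are used. The inner torch.cuda.is_available() ternary in the committed code is always true on the GPU branch, so the sketch drops it.

import tempfile
import torch

def load_model_with_fallbacks(model_class, model_id, token=None):
    # Hypothetical helper: pick dtype/device_map from CUDA availability and
    # fall back to disk offloading on out-of-memory / offload_dir errors.
    if not torch.cuda.is_available():
        return model_class.from_pretrained(
            model_id, torch_dtype="auto", device_map="cpu",
            token=token, trust_remote_code=True
        )
    try:
        return model_class.from_pretrained(
            model_id, torch_dtype=torch.float16, device_map="auto",
            token=token, trust_remote_code=True
        )
    except (ValueError, RuntimeError) as e:
        if "out of memory" not in str(e).lower() and "offload_dir" not in str(e):
            raise
        # The offload directory must outlive the model, so it is created with
        # mkdtemp rather than a with-block; the caller cleans it up later.
        offload_dir = tempfile.mkdtemp(prefix="offload_")
        return model_class.from_pretrained(
            model_id, torch_dtype=torch.float16, device_map="auto",
            offload_folder=offload_dir,
            max_memory={0: "24GB", "cpu": "48GB"},  # assumed limits, as in the diff
            token=token, trust_remote_code=True
        )

One deliberate difference from the committed code: the offload directory is created with tempfile.mkdtemp instead of a with-block, because deleting the directory while weights are still offloaded there would likely invalidate the loaded model.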
quantize_huihui_fara.py CHANGED
@@ -191,12 +191,39 @@ def quantize_huihui_fara_model(
         Quantized model
     """
     print(f"Loading model: {model_id}")
-    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-        model_id,
-        torch_dtype=torch.float16,  # Use float16 to save memory
-        device_map="auto",  # Auto device mapping for memory efficiency
-        trust_remote_code=trust_remote_code
-    )
+
+    # Handle different device scenarios properly
+    if torch.cuda.is_available():
+        try:
+            model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+                model_id,
+                torch_dtype=torch.float16,  # Use float16 to save memory
+                device_map="auto",  # Auto device mapping for memory efficiency
+                trust_remote_code=trust_remote_code
+            )
+        except RuntimeError as e:
+            if "out of memory" in str(e).lower() or "offload_dir" in str(e):
+                print(f"Memory issue detected, using offloading: {e}")
+                import tempfile
+                with tempfile.TemporaryDirectory() as temp_dir:
+                    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+                        model_id,
+                        torch_dtype=torch.float16,
+                        device_map="auto",
+                        offload_folder=temp_dir,
+                        max_memory={0: "24GB", "cpu": "48GB"},
+                        trust_remote_code=trust_remote_code
+                    )
+            else:
+                raise
+    else:
+        # CPU only
+        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model_id,
+            torch_dtype=torch.float32,  # Use float32 on CPU
+            device_map="cpu",
+            trust_remote_code=trust_remote_code
+        )
 
     print(f"Loading processor for: {model_id}")
     processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=trust_remote_code)
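The loading branch above is duplicated verbatim in quantize_qwen2_5_vl.py. A small sketch of how the dtype/device choice could be expressed once and reused; qwen_loading_kwargs is a hypothetical name, not something defined in this repo.

import torch

def qwen_loading_kwargs():
    # Mirrors the branch above: float16 with automatic sharding on GPU, float32 on CPU.
    if torch.cuda.is_available():
        return {"torch_dtype": torch.float16, "device_map": "auto"}
    return {"torch_dtype": torch.float32, "device_map": "cpu"}

# Illustrative use inside quantize_huihui_fara_model / quantize_qwen2_5_vl_model:
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
#     model_id, trust_remote_code=trust_remote_code, **qwen_loading_kwargs()
# )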
quantize_qwen2_5_vl.py CHANGED
@@ -189,12 +189,39 @@ def quantize_qwen2_5_vl_model(
         Quantized model
     """
     print(f"Loading model: {model_id}")
-    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-        model_id,
-        torch_dtype="auto",
-        device_map=None,  # Let the system decide device mapping
-        trust_remote_code=trust_remote_code
-    )
+
+    # Handle different device scenarios properly
+    if torch.cuda.is_available():
+        try:
+            model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+                model_id,
+                torch_dtype=torch.float16,  # Use float16 to save memory on GPU
+                device_map="auto",  # Auto device mapping for memory efficiency
+                trust_remote_code=trust_remote_code
+            )
+        except RuntimeError as e:
+            if "out of memory" in str(e).lower() or "offload_dir" in str(e):
+                print(f"Memory issue detected, using offloading: {e}")
+                import tempfile
+                with tempfile.TemporaryDirectory() as temp_dir:
+                    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+                        model_id,
+                        torch_dtype=torch.float16,
+                        device_map="auto",
+                        offload_folder=temp_dir,
+                        max_memory={0: "24GB", "cpu": "48GB"},
+                        trust_remote_code=trust_remote_code
+                    )
+            else:
+                raise
+    else:
+        # CPU only
+        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model_id,
+            torch_dtype=torch.float32,  # Use float32 on CPU
+            device_map="cpu",
+            trust_remote_code=trust_remote_code
+        )
 
     print(f"Loading processor for: {model_id}")
     processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=trust_remote_code)
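With device_map="auto" it can be worth confirming where the weights actually landed before running quantization. A short check, assuming a transformers/accelerate version that records the placement on model.hf_device_map (the getattr guard keeps it harmless otherwise):

import torch

def describe_placement(model):
    # Print dtype and device of the first parameter, plus the accelerate
    # device map if one was attached during loading.
    p = next(model.parameters())
    print(f"dtype: {p.dtype}, device: {p.device}")
    device_map = getattr(model, "hf_device_map", None)
    if device_map is not None:
        print(f"hf_device_map: {device_map}")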
tests/test_app.py CHANGED
@@ -111,8 +111,18 @@ def test_compress_and_upload_success(
     result = compress_and_upload(model_id, quant_method, model_type_selection, mock_gr_oauth_token)
 
     mock_whoami.assert_called_once_with(token="test_token")
+
+    # The device_map and torch_dtype should depend on CUDA availability
+    import torch
+    if torch.cuda.is_available():
+        expected_torch_dtype = torch.float16
+        expected_device_map = "auto"
+    else:
+        expected_torch_dtype = "auto"
+        expected_device_map = "cpu"
+
     mock_auto_model_for_causal_lm.from_pretrained.assert_called_once_with(
-        model_id, torch_dtype="auto", device_map=None, token="test_token", trust_remote_code=True
+        model_id, torch_dtype=expected_torch_dtype, device_map=expected_device_map, token="test_token", trust_remote_code=True
     )
     mock_oneshot.assert_called_once()
     assert mock_oneshot.call_args[1]["model"] == mock_auto_model_for_causal_lm.from_pretrained.return_value
@@ -148,8 +158,17 @@ def test_compress_and_upload_with_trust_remote_code(
     model_type_selection = "Auto-detect (recommended)"
     compress_and_upload(model_id, quant_method, model_type_selection, mock_gr_oauth_token)
 
+    # The device_map and torch_dtype should depend on CUDA availability
+    import torch
+    if torch.cuda.is_available():
+        expected_torch_dtype = torch.float16
+        expected_device_map = "auto"
+    else:
+        expected_torch_dtype = "auto"
+        expected_device_map = "cpu"
+
     mock_auto_model_for_causal_lm.from_pretrained.assert_called_once_with(
-        model_id, torch_dtype="auto", device_map=None, token="test_token", trust_remote_code=True
+        model_id, torch_dtype=expected_torch_dtype, device_map=expected_device_map, token="test_token", trust_remote_code=True
     )
 
 
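Because the expected torch_dtype and device_map now depend on the machine running the suite, these assertions will flip between CPU and GPU runners. One way to pin them is to patch CUDA detection inside the test; a sketch, assuming app.py does a module-level import torch so that app.torch.cuda.is_available is a valid patch target, and reusing the mock names already present in these tests:

from unittest.mock import patch

# Inside an existing test, wrap the call to pin the expectation to the CPU branch
# (patch target assumes app.py imports torch at module level):
with patch("app.torch.cuda.is_available", return_value=False):
    compress_and_upload(model_id, quant_method, model_type_selection, mock_gr_oauth_token)

mock_auto_model_for_causal_lm.from_pretrained.assert_called_once_with(
    model_id, torch_dtype="auto", device_map="cpu", token="test_token", trust_remote_code=True
)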