Files changed:
- app.py (+74 -6)
- quantize_huihui_fara.py (+33 -6)
- quantize_qwen2_5_vl.py (+33 -6)
- tests/test_app.py (+21 -2)
app.py
CHANGED

@@ -358,19 +358,87 @@ def compress_and_upload(
     try:
         # Show sub-steps during model loading
         progress(0.05, desc="Stage 1/5: Determining model class...")
-   [3 removed lines not shown in this view]
+
+        # Determine the optimal device configuration based on available resources
+        if torch.cuda.is_available():
+            # If CUDA is available, use auto device mapping to distribute model across available devices
+            model = model_class.from_pretrained(
+                model_id,
+                torch_dtype=torch.float16 if torch.cuda.is_available() else "auto",
+                device_map="auto",
+                token=token,
+                trust_remote_code=True
+            )
+        else:
+            # If no CUDA, load on CPU
+            model = model_class.from_pretrained(
+                model_id,
+                torch_dtype="auto",
+                device_map="cpu",
+                token=token,
+                trust_remote_code=True
+            )
         progress(0.15, desc="Stage 1/5: Model loaded, loading tokenizer...")
     except ValueError as e:
         if "Unrecognized configuration class" in str(e):
             # If automatic detection fails, fall back to AutoModel and let transformers handle it
             print(f"Automatic model class detection failed, falling back to AutoModel: {e}")
             progress(0.05, desc="Stage 1/5: Using fallback model class...")
-   [3 removed lines not shown in this view]
+
+            if torch.cuda.is_available():
+                model = AutoModel.from_pretrained(
+                    model_id,
+                    torch_dtype=torch.float16 if torch.cuda.is_available() else "auto",
+                    device_map="auto",
+                    token=token,
+                    trust_remote_code=True
+                )
+            else:
+                model = AutoModel.from_pretrained(
+                    model_id,
+                    torch_dtype="auto",
+                    device_map="cpu",
+                    token=token,
+                    trust_remote_code=True
+                )
             progress(0.15, desc="Stage 1/5: Model loaded with fallback class...")
+        elif "offload_dir" in str(e):
+            # If the error mentions offload_dir, try with disk offloading
+            print(f"Model requires offloading, trying with temporary offload directory: {e}")
+            progress(0.05, desc="Stage 1/5: Setting up model with offloading...")
+
+            import tempfile
+            with tempfile.TemporaryDirectory() as temp_dir:
+                model = model_class.from_pretrained(
+                    model_id,
+                    torch_dtype=torch.float16 if torch.cuda.is_available() else "auto",
+                    device_map="auto",
+                    offload_folder=temp_dir,
+                    token=token,
+                    trust_remote_code=True
+                )
+            progress(0.15, desc="Stage 1/5: Model loaded with offloading...")
+        else:
+            raise
+    except RuntimeError as e:
+        if "out of memory" in str(e).lower() or "offload_dir" in str(e):
+            # If there's an out of memory error or offload_dir error, try memory-efficient loading
+            print(f"Memory issue detected, trying with CPU offloading: {e}")
+            progress(0.05, desc="Stage 1/5: Setting up memory-efficient model loading...")
+
+            # Use CPU offloading to handle memory constraints
+            import tempfile
+            with tempfile.TemporaryDirectory() as temp_dir:
+                model = model_class.from_pretrained(
+                    model_id,
+                    torch_dtype=torch.float16 if torch.cuda.is_available() else "auto",
+                    device_map="auto",
+                    offload_folder=temp_dir,
+                    max_memory={0: "24GB", "cpu": "48GB"},  # Limit GPU memory usage
+                    token=token,
+                    trust_remote_code=True
+                )
+            progress(0.15, desc="Stage 1/5: Model loaded with memory-efficient approach...")
         else:
             raise
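The device-aware loading logic above is now repeated for the primary model class, for the AutoModel fallback, and (below) in both quantization scripts. As an illustration only (the helper name and the default memory caps are not part of this change), the shared pattern could be factored into a single function along these lines:

# Illustrative sketch, not part of this PR: one helper for the repeated
# "try a fast GPU load, fall back to disk offload on memory errors" pattern.
import tempfile

import torch


def load_with_memory_fallback(model_class, model_id, token=None, max_memory=None):
    """Load `model_id` with `model_class`, retrying with disk offload on OOM."""
    if not torch.cuda.is_available():
        # CPU-only environment: let transformers pick the dtype, keep weights on the CPU.
        return model_class.from_pretrained(
            model_id, torch_dtype="auto", device_map="cpu",
            token=token, trust_remote_code=True,
        )
    try:
        # Preferred path: fp16 weights distributed across the available GPUs.
        return model_class.from_pretrained(
            model_id, torch_dtype=torch.float16, device_map="auto",
            token=token, trust_remote_code=True,
        )
    except (RuntimeError, ValueError) as e:
        if "out of memory" not in str(e).lower() and "offload_dir" not in str(e):
            raise
        # Retry with capped GPU/CPU budgets and spill the remainder to disk.
        offload_dir = tempfile.mkdtemp(prefix="hf-offload-")
        return model_class.from_pretrained(
            model_id, torch_dtype=torch.float16, device_map="auto",
            offload_folder=offload_dir, max_memory=max_memory or {0: "24GB", "cpu": "48GB"},
            token=token, trust_remote_code=True,
        )

app.py could then call load_with_memory_fallback(model_class, model_id, token=token) in both the primary and the AutoModel path, and the quantization scripts could reuse it without the token argument.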
quantize_huihui_fara.py
CHANGED

@@ -191,12 +191,39 @@ def quantize_huihui_fara_model(
         Quantized model
     """
     print(f"Loading model: {model_id}")
-   [6 removed lines not shown in this view]
+
+    # Handle different device scenarios properly
+    if torch.cuda.is_available():
+        try:
+            model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+                model_id,
+                torch_dtype=torch.float16,  # Use float16 to save memory
+                device_map="auto",  # Auto device mapping for memory efficiency
+                trust_remote_code=trust_remote_code
+            )
+        except RuntimeError as e:
+            if "out of memory" in str(e).lower() or "offload_dir" in str(e):
+                print(f"Memory issue detected, using offloading: {e}")
+                import tempfile
+                with tempfile.TemporaryDirectory() as temp_dir:
+                    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+                        model_id,
+                        torch_dtype=torch.float16,
+                        device_map="auto",
+                        offload_folder=temp_dir,
+                        max_memory={0: "24GB", "cpu": "48GB"},
+                        trust_remote_code=trust_remote_code
+                    )
+            else:
+                raise
+    else:
+        # CPU only
+        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model_id,
+            torch_dtype=torch.float32,  # Use float32 on CPU
+            device_map="cpu",
+            trust_remote_code=trust_remote_code
+        )

     print(f"Loading processor for: {model_id}")
     processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=trust_remote_code)
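After either branch it can be useful to confirm where the weights actually ended up. A quick check, assuming the model was loaded through the accelerate-backed path (i.e. a device_map was given):

# transformers records the final layer placement when device_map is used;
# printing it shows whether any layers were offloaded to CPU or to disk.
print(getattr(model, "hf_device_map", None))
# e.g. {'model.embed_tokens': 0, 'model.layers.0': 0, ..., 'lm_head': 'cpu'}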
quantize_qwen2_5_vl.py
CHANGED

@@ -189,12 +189,39 @@ def quantize_qwen2_5_vl_model(
         Quantized model
     """
     print(f"Loading model: {model_id}")
-   [6 removed lines not shown in this view]
+
+    # Handle different device scenarios properly
+    if torch.cuda.is_available():
+        try:
+            model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+                model_id,
+                torch_dtype=torch.float16,  # Use float16 to save memory on GPU
+                device_map="auto",  # Auto device mapping for memory efficiency
+                trust_remote_code=trust_remote_code
+            )
+        except RuntimeError as e:
+            if "out of memory" in str(e).lower() or "offload_dir" in str(e):
+                print(f"Memory issue detected, using offloading: {e}")
+                import tempfile
+                with tempfile.TemporaryDirectory() as temp_dir:
+                    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+                        model_id,
+                        torch_dtype=torch.float16,
+                        device_map="auto",
+                        offload_folder=temp_dir,
+                        max_memory={0: "24GB", "cpu": "48GB"},
+                        trust_remote_code=trust_remote_code
+                    )
+            else:
+                raise
+    else:
+        # CPU only
+        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model_id,
+            torch_dtype=torch.float32,  # Use float32 on CPU
+            device_map="cpu",
+            trust_remote_code=trust_remote_code
+        )

     print(f"Loading processor for: {model_id}")
     processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=trust_remote_code)
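For reference, the max_memory={0: "24GB", "cpu": "48GB"} argument used in both quantization scripts and in app.py is the accelerate-style placement budget consumed when device_map="auto": integer keys are CUDA device indices, the "cpu" key caps host RAM, and anything over both caps is placed in offload_folder. In short:

# Placement budget passed to from_pretrained(); the keys and caps below are the
# ones used in this change and should be tuned to the Space's actual hardware.
max_memory = {
    0: "24GB",      # CUDA device 0: keep at most ~24 GB of weights on the GPU
    "cpu": "48GB",  # then up to ~48 GB of weights in host RAM
}
# Layers that fit under neither cap are offloaded to offload_folder on disk.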
tests/test_app.py
CHANGED

@@ -111,8 +111,18 @@ def test_compress_and_upload_success(
     result = compress_and_upload(model_id, quant_method, model_type_selection, mock_gr_oauth_token)

     mock_whoami.assert_called_once_with(token="test_token")
+
+    # The device_map and torch_dtype should depend on CUDA availability
+    import torch
+    if torch.cuda.is_available():
+        expected_torch_dtype = torch.float16
+        expected_device_map = "auto"
+    else:
+        expected_torch_dtype = "auto"
+        expected_device_map = "cpu"
+
     mock_auto_model_for_causal_lm.from_pretrained.assert_called_once_with(
-        model_id, torch_dtype=…
+        model_id, torch_dtype=expected_torch_dtype, device_map=expected_device_map, token="test_token", trust_remote_code=True
     )
     mock_oneshot.assert_called_once()
     assert mock_oneshot.call_args[1]["model"] == mock_auto_model_for_causal_lm.from_pretrained.return_value

@@ -148,8 +158,17 @@ def test_compress_and_upload_with_trust_remote_code(
     model_type_selection = "Auto-detect (recommended)"
     compress_and_upload(model_id, quant_method, model_type_selection, mock_gr_oauth_token)

+    # The device_map and torch_dtype should depend on CUDA availability
+    import torch
+    if torch.cuda.is_available():
+        expected_torch_dtype = torch.float16
+        expected_device_map = "auto"
+    else:
+        expected_torch_dtype = "auto"
+        expected_device_map = "cpu"
+
     mock_auto_model_for_causal_lm.from_pretrained.assert_called_once_with(
-        model_id, torch_dtype=…
+        model_id, torch_dtype=expected_torch_dtype, device_map=expected_device_map, token="test_token", trust_remote_code=True
     )
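Because expected_torch_dtype and expected_device_map are derived from a runtime CUDA check, each CI run exercises only one branch of the new loading logic. A sketch of how the CPU branch could be pinned regardless of the runner's hardware, reusing the names from the tests above (not part of this change):

# Illustrative sketch only: force the CPU path by patching the CUDA check that
# app.py performs when loading the model, so the expected values are fixed.
from unittest.mock import patch

with patch("torch.cuda.is_available", return_value=False):
    compress_and_upload(model_id, quant_method, model_type_selection, mock_gr_oauth_token)

mock_auto_model_for_causal_lm.from_pretrained.assert_called_once_with(
    model_id, torch_dtype="auto", device_map="cpu",
    token="test_token", trust_remote_code=True,
)

A mirrored variant with return_value=True would assert torch_dtype=torch.float16 and device_map="auto" without needing a GPU runner, since from_pretrained is mocked.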