#!/usr/bin/env python
"""
Script to quantize the huihui-ai/Huihui-Fara-7B-abliterated model with Qwen2.5-VL architecture support
Uses sequential onloading for memory efficiency.
"""
import base64
from io import BytesIO
import torch
from datasets import load_dataset
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier
from llmcompressor.modifiers.awq import AWQModifier, AWQMapping
from llmcompressor.utils import dispatch_for_generation
def create_qwen2_5_vl_data_collator():
"""Create a data collator for Qwen2.5-VL models that handles multimodal inputs."""
def data_collator(batch):
assert len(batch) == 1
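        # Tensorize list- and scalar-valued features (e.g. input_ids, attention_mask,
        # pixel_values); pass any other values through unchanged.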
return {key: torch.tensor(value) if isinstance(value, (list, int, float)) else value
for key, value in batch[0].items()}
return data_collator
def create_qwen2_5_vl_preprocessing_fn(processor, max_sequence_length: int = 2048):
"""Create a preprocessing function for Qwen2.5-VL datasets."""
def preprocess_and_tokenize(example):
# Handle different image formats
if 'image' in example:
# Process image
if hasattr(example['image'], 'save'):
# PIL Image object
buffered = BytesIO()
example["image"].save(buffered, format="PNG")
encoded_image = base64.b64encode(buffered.getvalue())
encoded_image_text = encoded_image.decode("utf-8")
base64_qwen = f"data:image;base64,{encoded_image_text}"
else:
# Already a string or other format
base64_qwen = str(example["image"])
else:
# If there's no image field, try 'img' or similar
img_key = None
for key in example.keys():
if 'image' in key.lower() or 'img' in key.lower():
img_key = key
break
if img_key:
if hasattr(example[img_key], 'save'):
buffered = BytesIO()
example[img_key].save(buffered, format="PNG")
encoded_image = base64.b64encode(buffered.getvalue())
encoded_image_text = encoded_image.decode("utf-8")
base64_qwen = f"data:image;base64,{encoded_image_text}"
else:
base64_qwen = str(example[img_key])
else:
# If no image, create a simple text-only example
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": example.get('text', example.get('content', 'What can you tell me about this?'))},
],
}
]
text = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
return processor(
text=[text],
padding=False,
max_length=max_sequence_length,
truncation=True,
)
# Create message with image
messages = [
{
"role": "user",
"content": [
{"type": "image", "image": base64_qwen},
{"type": "text", "text": "What does the image show?"},
],
}
]
text = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
# tokenize
return processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=False,
max_length=max_sequence_length,
truncation=True,
)
return preprocess_and_tokenize
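# Typical use (a sketch mirroring quantize_huihui_fara_model() below; the dataset and
# split are illustrative):
#   processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
#   preprocess = create_qwen2_5_vl_preprocessing_fn(processor, max_sequence_length=2048)
#   ds = load_dataset("lmms-lab/flickr30k", split="test[:64]")
#   ds = ds.map(preprocess, remove_columns=ds.column_names)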
def get_qwen2_5_vl_quantization_recipe(method: str, scheme: str = "W4A16"):
"""
Creates the appropriate quantization recipe for Qwen2.5-VL models.
Args:
method: Quantization method ("GPTQ", "AWQ", or "FP8")
scheme: Quantization scheme (e.g., "W4A16", "W8A8", "FP8")
Returns:
List of modifiers for the quantization recipe
"""
if method == "GPTQ":
return [
GPTQModifier(
targets="Linear",
scheme=scheme,
ignore=["lm_head", "re:visual.*", "re:model.visual.*"],
sequential_targets=["Qwen2_5_VLDecoderLayer"], # This enables sequential onloading
),
]
elif method == "AWQ":
# Create AWQ mappings for Qwen2.5-VL architecture
mappings = [
AWQMapping(
"re:.*input_layernorm", ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"]
),
AWQMapping("re:.*v_proj", ["re:.*o_proj"]),
AWQMapping(
"re:.*post_attention_layernorm", ["re:.*gate_proj", "re:.*up_proj"]
),
AWQMapping("re:.*up_proj", ["re:.*down_proj"]),
]
return [
AWQModifier(
ignore=["lm_head", "re:visual.*", "re:model.visual.*"],
scheme="W4A16_ASYM" if scheme == "W4A16" else scheme,
targets=["Linear"],
mappings=mappings,
sequential_targets=["Qwen2_5_VLDecoderLayer"], # Sequential onloading for memory efficiency
),
]
elif method == "FP8":
return [
QuantizationModifier(
scheme="FP8",
targets="Linear",
ignore=["lm_head", "re:visual.*", "re:model.visual.*"]
)
]
else:
raise ValueError(f"Unsupported quantization method: {method}")
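# Example (illustrative): a 4-bit weight-only GPTQ recipe that keeps lm_head and the
# vision tower in full precision, as used by quantize_huihui_fara_model() below:
#   recipe = get_qwen2_5_vl_quantization_recipe("GPTQ", scheme="W4A16")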
def quantize_huihui_fara_model(
model_id: str = "huihui-ai/Huihui-Fara-7B-abliterated",
quantization_method: str = "GPTQ",
output_dir: str = None,
dataset_id: str = "wikitext",
dataset_config: str = "wikitext-2-raw-v1",
dataset_split: str = "train[:1%]",
num_calibration_samples: int = 64,
max_sequence_length: int = 512,
scheme: str = "W4A16",
trust_remote_code: bool = True,
):
"""
Quantizes the huihui-ai/Huihui-Fara-7B-abliterated model with proper Qwen2.5-VL architecture support.
Args:
model_id: Hugging Face model ID to quantize
quantization_method: Method to use ("GPTQ", "AWQ", or "FP8")
output_dir: Directory to save the quantized model
dataset_id: Dataset ID for calibration
dataset_config: Dataset config for calibration
dataset_split: Dataset split for calibration
num_calibration_samples: Number of samples to use for calibration
max_sequence_length: Maximum sequence length for processing
scheme: Quantization scheme (e.g., "W4A16", "W8A8")
trust_remote_code: Whether to trust remote code in model loading
Returns:
Quantized model
"""
print(f"Loading model: {model_id}")
# Handle different device scenarios properly
if torch.cuda.is_available():
try:
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
model_id,
torch_dtype=torch.float16, # Use float16 to save memory
device_map="auto", # Auto device mapping for memory efficiency
trust_remote_code=trust_remote_code
)
except RuntimeError as e:
if "out of memory" in str(e).lower() or "offload_dir" in str(e):
print(f"Memory issue detected, using offloading: {e}")
import tempfile
with tempfile.TemporaryDirectory() as temp_dir:
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
model_id,
torch_dtype=torch.float16,
device_map="auto",
offload_folder=temp_dir,
max_memory={0: "24GB", "cpu": "48GB"},
trust_remote_code=trust_remote_code
)
else:
raise
else:
# CPU only
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
model_id,
torch_dtype=torch.float32, # Use float32 on CPU
device_map="cpu",
trust_remote_code=trust_remote_code
)
print(f"Loading processor for: {model_id}")
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=trust_remote_code)
# If output directory not specified, create one based on model and method
if not output_dir:
model_name = model_id.rstrip("/").split("/")[-1]
output_dir = f"{model_name}-{scheme.replace(':', '-')}-{quantization_method}"
print(f"Output directory: {output_dir}")
# Load dataset and preprocess
print(f"Loading dataset: {dataset_id}")
try:
# Try to load a multimodal dataset first
ds = load_dataset("lmms-lab/flickr30k", split="test[:64]")
print("Using multimodal dataset for calibration")
preprocess_fn = create_qwen2_5_vl_preprocessing_fn(processor, max_sequence_length)
ds = ds.map(preprocess_fn, remove_columns=ds.column_names)
except Exception as e:
print(f"Failed to load multimodal dataset: {e}, falling back to text-only dataset")
# If multimodal dataset fails, use text-only
ds = load_dataset(dataset_id, dataset_config, split=dataset_split)
ds = ds.shuffle(seed=42)
# Text-only preprocessing
def text_only_preprocess(example):
text = example.get('text', example.get('content', str(example)))
if not isinstance(text, str):
text = str(text)
# Limit text length to avoid exceeding max sequence length
            text = (text[:500] + "...") if len(text) > 500 else text
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": text},
],
}
]
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
return processor(text=[prompt], padding=False, max_length=max_sequence_length, truncation=True)
ds = ds.map(text_only_preprocess, remove_columns=ds.column_names)
# Define data collator
data_collator = create_qwen2_5_vl_data_collator()
# Create recipe
recipe = get_qwen2_5_vl_quantization_recipe(quantization_method, scheme)
print(f"Starting quantization with method: {quantization_method}")
print(f"Using recipe: {recipe}")
print(f"Using sequential targets: {[mod.sequential_targets if hasattr(mod, 'sequential_targets') else 'N/A' for mod in recipe]}")
# Perform oneshot quantization with sequential onloading for memory efficiency
oneshot(
model=model,
tokenizer=processor, # Use processor as tokenizer for Qwen2.5-VL
dataset=ds,
recipe=recipe,
max_seq_length=max_sequence_length,
num_calibration_samples=num_calibration_samples,
trust_remote_code_model=trust_remote_code,
data_collator=data_collator,
save_compressed=True,
output_dir=output_dir,
)
print(f"Quantization completed! Model saved to: {output_dir}")
# Save the processor as well
processor.save_pretrained(output_dir)
return model
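# Example (programmatic use; argument values are illustrative):
#   model = quantize_huihui_fara_model(
#       quantization_method="AWQ",
#       scheme="W4A16",
#       num_calibration_samples=128,
#   )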
def test_quantized_model(model, processor, max_sequence_length: int = 2048):
"""
Tests the quantized model with a sample generation.
"""
print("========== SAMPLE GENERATION ==============")
try:
dispatch_for_generation(model)
# Simple text-only test first
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Hello, how are you today?"},
],
}
]
        prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[prompt],
padding=False,
max_length=max_sequence_length,
truncation=True,
return_tensors="pt",
).to(model.device)
output = model.generate(**inputs, max_new_tokens=50)
result = processor.decode(output[0], skip_special_tokens=True)
print(result)
print("==========================================")
return result
except Exception as e:
print(f"Test generation failed: {e}")
import traceback
traceback.print_exc()
return None
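# A multimodal smoke test could reuse the image message format from
# create_qwen2_5_vl_preprocessing_fn() above (sketch; the image source is illustrative):
#   messages = [{"role": "user", "content": [
#       {"type": "image", "image": "data:image;base64,<encoded image>"},
#       {"type": "text", "text": "What does the image show?"},
#   ]}]
#   image_inputs, video_inputs = process_vision_info(messages)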
def main():
"""
Main function to quantize the Huihui-Fara model.
"""
import argparse
parser = argparse.ArgumentParser(description="Quantize huihui-ai/Huihui-Fara-7B-abliterated model")
parser.add_argument("--model_id", type=str, default="huihui-ai/Huihui-Fara-7B-abliterated",
help="Model ID to quantize")
parser.add_argument("--method", type=str, choices=["GPTQ", "AWQ", "FP8"],
default="GPTQ", help="Quantization method to use")
parser.add_argument("--output_dir", type=str, default=None,
help="Output directory for quantized model")
parser.add_argument("--dataset_id", type=str, default="wikitext",
help="Dataset for calibration (default: wikitext)")
parser.add_argument("--scheme", type=str, default="W4A16",
help="Quantization scheme (e.g., W4A16, W8A8)")
parser.add_argument("--num_samples", type=int, default=64,
help="Number of calibration samples")
args = parser.parse_args()
print(f"Starting quantization of {args.model_id} using {args.method}")
print("Note: This may take a while and will use sequential onloading for memory efficiency...")
try:
# Quantize the model
quantized_model = quantize_huihui_fara_model(
model_id=args.model_id,
quantization_method=args.method,
output_dir=args.output_dir,
dataset_id=args.dataset_id,
num_calibration_samples=args.num_samples,
scheme=args.scheme
)
# Test the model
processor = AutoProcessor.from_pretrained(args.model_id, trust_remote_code=True)
test_quantized_model(quantized_model, processor)
print(f"✅ Successfully quantized {args.model_id} with {args.method}")
print(f"Model saved to: {args.output_dir or args.model_id.split('/')[-1] + f'-{args.scheme}-{args.method}'}")
except Exception as e:
print(f"❌ Quantization failed: {e}")
import traceback
traceback.print_exc()
if __name__ == "__main__":
main()