Delete tools
- tools/convert.py +0 -412
- tools/fix_5d_tensors.py +0 -85
- tools/fix_lines_ending.py +0 -31
- tools/lcpp.patch +0 -499
- tools/read_tensors.py +0 -21
- tools/tool_auto.py +0 -374
tools/convert.py
DELETED
@@ -1,412 +0,0 @@
# (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0)
import os
import gguf
import torch
import logging
import argparse
from tqdm import tqdm
from safetensors.torch import load_file, save_file

QUANTIZATION_THRESHOLD = 1024
REARRANGE_THRESHOLD = 512
MAX_TENSOR_NAME_LENGTH = 127
MAX_TENSOR_DIMS = 4

class ModelTemplate:
    arch = "invalid" # string describing architecture
    shape_fix = False # whether to reshape tensors
    ndims_fix = False # whether to save fix file for tensors exceeding max dims
    keys_detect = [] # list of lists to match in state dict
    keys_banned = [] # list of keys that should mark model as invalid for conversion
    keys_hiprec = [] # list of keys that need to be kept in fp32 for some reason
    keys_ignore = [] # list of strings to ignore keys by when found

class ModelFlux(ModelTemplate):
    arch = "flux"
    keys_detect = [
        ("single_transformer_blocks.0.attn.norm_k.weight",),
        ("double_blocks.0.img_attn.proj.weight",),
    ]
    keys_banned = ["single_transformer_blocks.0.attn.norm_k.weight",]

class ModelSD3(ModelTemplate):
    arch = "sd3"
    keys_detect = [
        ("transformer_blocks.0.ff_context.net.0.proj.weight",),
        ("joint_blocks.0.x_block.attn.qkv.weight",),
    ]
    keys_banned = ["transformer_blocks.0.ff_context.net.0.proj.weight",]

class ModelAura(ModelTemplate):
    arch = "aura"
    keys_detect = [
        ("double_layers.3.modX.1.weight",),
        ("joint_transformer_blocks.3.ff_context.out_projection.weight",),
    ]
    keys_banned = ["joint_transformer_blocks.3.ff_context.out_projection.weight",]

class ModelHiDream(ModelTemplate):
    arch = "hidream"
    keys_detect = [
        (
            "caption_projection.0.linear.weight",
            "double_stream_blocks.0.block.ff_i.shared_experts.w3.weight"
        )
    ]
    keys_hiprec = [
        # nn.parameter, can't load from BF16 ver
        ".ff_i.gate.weight",
        "img_emb.emb_pos"
    ]

class ModelCosmosPredict2(ModelTemplate):
    arch = "cosmos"
    keys_detect = [
        (
            "blocks.0.mlp.layer1.weight",
            "blocks.0.adaln_modulation_cross_attn.1.weight",
        )
    ]
    keys_hiprec = ["pos_embedder"]
    keys_ignore = ["_extra_state", "accum_"]

class ModelQwenImage(ModelTemplate):
    arch = "qwen_image"
    keys_detect = [
        (
            "time_text_embed.timestep_embedder.linear_2.weight",
            "transformer_blocks.0.attn.norm_added_q.weight",
            "transformer_blocks.0.img_mlp.net.0.proj.weight",
        )
    ]

class ModelHyVid(ModelTemplate):
    arch = "hyvid"
    ndims_fix = True
    keys_detect = [
        (
            "double_blocks.0.img_attn_proj.weight",
            "txt_in.individual_token_refiner.blocks.1.self_attn_qkv.weight",
        )
    ]

class ModelWan(ModelTemplate):
    arch = "wan"
    ndims_fix = True
    keys_detect = [
        (
            "blocks.0.self_attn.norm_q.weight",
            "text_embedding.2.weight",
            "head.modulation",
        )
    ]
    keys_hiprec = [
        ".modulation", # nn.parameter, can't load from BF16 ver
        ".encoder.padding_tokens", # nn.parameter, specific to S2V
        "trainable_cond_mask", # used directly w/ .weight
        "casual_audio_encoder.weights", # nn.parameter, specific to S2V
        "casual_audio_encoder.encoder.conv", # CausalConv1d doesn't use ops.py for now
    ]

class ModelLTXV(ModelTemplate):
    arch = "ltxv"
    keys_detect = [
        (
            "adaln_single.emb.timestep_embedder.linear_2.weight",
            "transformer_blocks.27.scale_shift_table",
            "caption_projection.linear_2.weight",
        )
    ]
    keys_hiprec = [
        "scale_shift_table" # nn.parameter, can't load from BF16 base quant
    ]

class ModelSDXL(ModelTemplate):
    arch = "sdxl"
    shape_fix = True
    keys_detect = [
        ("down_blocks.0.downsamplers.0.conv.weight", "add_embedding.linear_1.weight",),
        (
            "input_blocks.3.0.op.weight", "input_blocks.6.0.op.weight",
            "output_blocks.2.2.conv.weight", "output_blocks.5.2.conv.weight",
        ), # Non-diffusers
        ("label_emb.0.0.weight",),
    ]

class ModelSD1(ModelTemplate):
    arch = "sd1"
    shape_fix = True
    keys_detect = [
        ("down_blocks.0.downsamplers.0.conv.weight",),
        (
            "input_blocks.3.0.op.weight", "input_blocks.6.0.op.weight", "input_blocks.9.0.op.weight",
            "output_blocks.2.1.conv.weight", "output_blocks.5.2.conv.weight", "output_blocks.8.2.conv.weight"
        ), # Non-diffusers
    ]

class ModelLumina2(ModelTemplate):
    arch = "lumina2"
    keys_detect = [
        ("cap_embedder.1.weight", "context_refiner.0.attention.qkv.weight")
    ]

class ModelHuMo(ModelTemplate):
    arch = "humo"
    ndims_fix = True
    keys_detect = [
        ("blocks.39.audio_cross_attn_wrapper.norm1_audio.weight",),
        ("audio_proj.audio_proj_glob_1.layer.weight",),
        (
            "blocks.39.audio_cross_attn_wrapper.norm1_audio.weight",
            "blocks.0.self_attn.norm_q.weight",
            "text_embedding.2.weight",
            "head.modulation"
        ),
    ]
    keys_hiprec = ["patch_embedding", "text_embedding", "time_embedding", ".modulation"]

# The architectures are checked in order and the first successful match terminates the search.
arch_list = [
    ModelFlux, ModelSD3, ModelAura, ModelHiDream, ModelCosmosPredict2, ModelQwenImage,
    ModelLTXV, ModelHyVid, ModelHuMo, ModelWan, ModelSDXL, ModelSD1, ModelLumina2
]

def is_model_arch(model, state_dict):
    # check if model is correct
    matched = False
    invalid = False
    # print(state_dict)
    for match_list in model.keys_detect:
        if all(key in state_dict for key in match_list):
            matched = True
            invalid = any(key in state_dict for key in model.keys_banned)
            break
    assert not invalid, f"Model architecture not allowed for conversion! (i.e. reference VS diffusers format) [arch:{model.arch}]"
    return matched

def detect_arch(state_dict):
    model_arch = None
    for arch in arch_list:
        if is_model_arch(arch, state_dict):
            model_arch = arch()
            break
    assert model_arch is not None, "Unknown model architecture!"
    return model_arch

def parse_args():
    parser = argparse.ArgumentParser(description="Generate F16 GGUF files from single UNET")
    parser.add_argument("--src", required=True, help="Source model ckpt file.")
    parser.add_argument("--dst", help="Output unet gguf file.")
    args = parser.parse_args()

    if not os.path.isfile(args.src):
        parser.error("No input provided!")

    return args

def strip_prefix(state_dict):
    # prefix for mixed state dict
    prefix = None
    for pfx in ["model.diffusion_model.", "model."]:
        if any([x.startswith(pfx) for x in state_dict.keys()]):
            prefix = pfx
            break

    # prefix for uniform state dict
    if prefix is None:
        for pfx in ["net."]:
            if all([x.startswith(pfx) for x in state_dict.keys()]):
                prefix = pfx
                break

    # strip prefix if found
    if prefix is not None:
        logging.info(f"State dict prefix found: '{prefix}'")
        sd = {}
        for k, v in state_dict.items():
            if prefix not in k:
                continue
            k = k.replace(prefix, "")
            sd[k] = v
    else:
        logging.debug("State dict has no prefix")
        sd = state_dict

    return sd

def find_main_dtype(state_dict, allow_fp32=False):
    # detect most common dtype in input
    dtypes = [x.dtype for x in state_dict.values()]
    dtypes = {x:dtypes.count(x) for x in set(dtypes)}
    main_dtype = max(dtypes, key=dtypes.get)

    if main_dtype == torch.bfloat16:
        ftype_name = "BF16"
        ftype_gguf = gguf.LlamaFileType.MOSTLY_BF16
    elif main_dtype == torch.float32 and allow_fp32:
        ftype_name = "F32"
        ftype_gguf = gguf.LlamaFileType.ALL_F32
    else:
        ftype_name = "F16"
        ftype_gguf = gguf.LlamaFileType.MOSTLY_F16

    return ftype_name, ftype_gguf

def load_state_dict(path):
    if any(path.endswith(x) for x in [".ckpt", ".pt", ".bin", ".pth"]):
        state_dict = torch.load(path, map_location="cpu", weights_only=True)
        for subkey in ["model", "module"]:
            if subkey in state_dict:
                state_dict = state_dict[subkey]
                break
        if len(state_dict) < 20:
            raise RuntimeError(f"pt subkey load failed: {state_dict.keys()}")
    else:
        state_dict = load_file(path)

    return strip_prefix(state_dict)

def handle_tensors(writer, state_dict, model_arch, allow_fp32=False):
    name_lengths = tuple(sorted(
        ((key, len(key)) for key in state_dict.keys()),
        key=lambda item: item[1],
        reverse=True,
    ))
    if not name_lengths:
        return
    max_name_len = name_lengths[0][1]

    if max_name_len > MAX_TENSOR_NAME_LENGTH:
        bad_list = ", ".join(f"{key!r} ({namelen})" for key, namelen in name_lengths if namelen > MAX_TENSOR_NAME_LENGTH)
        raise ValueError(f"Can only handle tensor names up to {MAX_TENSOR_NAME_LENGTH} characters. Tensors exceeding the limit: {bad_list}")

    invalid_tensors = {}
    quantized_tensors = {}
    for key, data in tqdm(state_dict.items()):
        old_dtype = data.dtype

        if any(x in key for x in model_arch.keys_ignore):
            tqdm.write(f"Filtering ignored key: '{key}'")
            continue

        if data.dtype == torch.bfloat16:
            data = data.to(torch.float32).numpy()
        # this is so we don't break torch 2.0.X
        elif data.dtype in [getattr(torch, "float8_e4m3fn", "_invalid"), getattr(torch, "float8_e5m2", "_invalid")]:
            data = data.to(torch.float16).numpy()
        else:
            data = data.numpy()

        n_dims = len(data.shape)
        data_shape = data.shape
        if old_dtype == torch.bfloat16:
            data_qtype = gguf.GGMLQuantizationType.BF16
        elif old_dtype == torch.float32 and allow_fp32:
            data_qtype = gguf.GGMLQuantizationType.F32
        else:
            data_qtype = gguf.GGMLQuantizationType.F16

        # The max no. of dimensions that can be handled by the quantization code is 4
        if len(data.shape) > MAX_TENSOR_DIMS:
            invalid_tensors[key] = data
            continue # needs to be added back later

        # get number of parameters (AKA elements) in this tensor
        n_params = 1
        for dim_size in data_shape:
            n_params *= dim_size

        if old_dtype in (torch.float32, torch.bfloat16):
            if n_dims == 1:
                # one-dimensional tensors should be kept in F32
                # also speeds up inference due to not dequantizing
                data_qtype = gguf.GGMLQuantizationType.F32

            elif n_params <= QUANTIZATION_THRESHOLD:
                # very small tensors
                data_qtype = gguf.GGMLQuantizationType.F32

            elif any(x in key for x in model_arch.keys_hiprec):
                # tensors that require max precision
                data_qtype = gguf.GGMLQuantizationType.F32

        if (model_arch.shape_fix # NEVER reshape for models such as flux
            and n_dims > 1 # Skip one-dimensional tensors
            and n_params >= REARRANGE_THRESHOLD # Only rearrange tensors meeting the size requirement
            and (n_params / 256).is_integer() # Rearranging only makes sense if total elements is divisible by 256
            and not (data.shape[-1] / 256).is_integer() # Only need to rearrange if the last dimension is not divisible by 256
        ):
            orig_shape = data.shape
            data = data.reshape(n_params // 256, 256)
            writer.add_array(f"comfy.gguf.orig_shape.{key}", tuple(int(dim) for dim in orig_shape))

        try:
            data = gguf.quants.quantize(data, data_qtype)
            quantized_tensors[key] = data_qtype
        except (AttributeError, gguf.QuantError) as e:
            tqdm.write(f"falling back to F16: {e}")
            data_qtype = gguf.GGMLQuantizationType.F16
            data = gguf.quants.quantize(data, data_qtype)
            quantized_tensors[key] = data_qtype

        shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}"
        tqdm.write(f"{f'%-{max_name_len + 4}s' % f'{key}'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")

        writer.add_tensor(key, data, raw_dtype=data_qtype)

    return quantized_tensors, invalid_tensors

def convert_file(path, dst_path=None, interact=True, overwrite=False, allow_fp32=False):
    # load & run model detection logic
    state_dict = load_state_dict(path)
    model_arch = detect_arch(state_dict)
    logging.info(f"* Architecture detected from input: {model_arch.arch}")

    ftype_name, ftype_gguf = find_main_dtype(state_dict, allow_fp32=allow_fp32)

    if dst_path is None:
        dst_path = f"{os.path.splitext(path)[0]}-{ftype_name}.gguf"
    elif "{ftype}" in dst_path: # lcpp logic
        dst_path = dst_path.replace("{ftype}", ftype_name)

    if os.path.isfile(dst_path) and not overwrite:
        if interact:
            input("Output exists enter to continue or ctrl+c to abort!")
        else:
            raise OSError("Output exists and overwriting is disabled!")

    # handle actual file
    writer = gguf.GGUFWriter(path=None, arch=model_arch.arch)
    writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
    if ftype_gguf is not None:
        writer.add_file_type(ftype_gguf)

    quantized_tensors, invalid_tensors = handle_tensors(writer, state_dict, model_arch, allow_fp32=allow_fp32)
    if len(invalid_tensors) > 0:
        if not model_arch.ndims_fix: # only applies to 5D fix for now, possibly expand to cover more cases?
            raise ValueError(f"Tensor(s) detected that exceeds dims supported by C++ code! ({invalid_tensors.keys()})")

        fix_path = os.path.join(
            os.path.dirname(dst_path),
            f"fix_5d_tensors_{model_arch.arch}.safetensors"
        )
        if os.path.isfile(fix_path):
            raise RuntimeError(f"Tensor fix file already exists! {path}")

        invalid_tensors = {k:torch.from_numpy(v.copy()) for k,v in invalid_tensors.items()}
        save_file(invalid_tensors, fix_path)
        logging.warning(f"\n### Warning! Fix file found at '{fix_path}'")
        logging.warning(" you most likely need to run 'fix_5d_tensors.py' after quantization.")
    else:
        fix_path = None

    writer.write_header_to_file(path=dst_path)
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file(progress=True)
    writer.close()

    return dst_path, model_arch, fix_path

if __name__ == "__main__":
    args = parse_args()
    convert_file(args.src, args.dst)
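
For reference, the converter above was driven either from the command line or by importing convert_file directly (tool_auto.py further down adds the tools folder to sys.path for exactly this). A minimal usage sketch; the input file name is hypothetical:

# CLI form: python tools/convert.py --src flux1-dev.safetensors
# writes flux1-dev-BF16.gguf (or -F16.gguf) next to the input.

# Programmatic form, matching the signature defined above:
from convert import convert_file

dst_path, model_arch, fix_path = convert_file(
    "flux1-dev.safetensors",  # hypothetical source UNET checkpoint
    dst_path=None,            # default: "<src>-<ftype>.gguf"
    interact=False,           # raise instead of prompting if the output exists
)
print(dst_path, model_arch.arch, fix_path)  # fix_path is None unless 5D tensors were split off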
tools/fix_5d_tensors.py
DELETED
@@ -1,85 +0,0 @@
# (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0)
import os
import gguf
import torch
import argparse
from tqdm import tqdm
from safetensors.torch import load_file

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--src", required=True)
    parser.add_argument("--dst", required=True)
    parser.add_argument("--fix", required=False, help="Defaults to ./fix_5d_tensors_[arch].pt")
    parser.add_argument("--overwrite", action="store_true")
    args = parser.parse_args()

    if not os.path.isfile(args.src):
        parser.error(f"Invalid source file '{args.src}'")
    if not args.overwrite and os.path.exists(args.dst):
        parser.error(f"Output exists, use '--overwrite' ({args.dst})")

    return args

def get_arch_str(reader):
    field = reader.get_field("general.architecture")
    return str(field.parts[field.data[-1]], encoding="utf-8")

def get_file_type(reader):
    field = reader.get_field("general.file_type")
    ft = int(field.parts[field.data[-1]])
    return gguf.LlamaFileType(ft)

def apply_5d_fix(src, dst, fix=None, overwrite=False):
    # read existing
    reader = gguf.GGUFReader(src)
    arch = get_arch_str(reader)
    file_type = get_file_type(reader)
    print(f"Detected arch: '{arch}' (ftype: {str(file_type)})")

    # prep fix
    if fix is None:
        fix = f"./fix_5d_tensors_{arch}.safetensors"

    if not os.path.isfile(fix):
        raise OSError(f"No 5D tensor fix file: {fix}")

    sd5d = load_file(fix)
    sd5d = {k:v.numpy() for k,v in sd5d.items()}
    print("5D tensors:", sd5d.keys())

    # prep output
    writer = gguf.GGUFWriter(path=None, arch=arch)
    writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
    writer.add_file_type(file_type)

    global added
    added = []
    def add_extra_key(writer, key, data):
        global added
        data_qtype = gguf.GGMLQuantizationType.F32
        data = gguf.quants.quantize(data, data_qtype)
        tqdm.write(f"Adding key {key} ({data.shape})")
        writer.add_tensor(key, data, raw_dtype=data_qtype)
        added.append(key)

    # main loop to add missing 5D tensor(s)
    for tensor in tqdm(reader.tensors):
        writer.add_tensor(tensor.name, tensor.data, raw_dtype=tensor.tensor_type)
        key5d = tensor.name.replace(".bias", ".weight")
        if key5d in sd5d.keys():
            add_extra_key(writer, key5d, sd5d[key5d])

    # brute force for any missed
    for key, data in sd5d.items():
        if key not in added:
            add_extra_key(writer, key, data)

    writer.write_header_to_file(path=dst)
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file(progress=True)
    writer.close()

if __name__ == "__main__":
    args = get_args()
    apply_5d_fix(args.src, args.dst, fix=args.fix, overwrite=args.overwrite)
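
The intended flow, per the warning emitted by convert.py above: quantize the F16/BF16 GGUF first, then re-insert the saved 5D tensors with this script. A minimal sketch, with hypothetical file names:

# CLI form: python tools/fix_5d_tensors.py --src wan-Q5_K_M.gguf --dst wan-Q5_K_M-fixed.gguf
# With --fix omitted it looks for ./fix_5d_tensors_<arch>.safetensors,
# the file written by convert.py for ndims_fix architectures (hyvid, wan, humo).
from fix_5d_tensors import apply_5d_fix

apply_5d_fix("wan-Q5_K_M.gguf", "wan-Q5_K_M-fixed.gguf")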
tools/fix_lines_ending.py
DELETED
@@ -1,31 +0,0 @@
import os

files = ["lcpp.patch", "lcpp_sd3.patch"]

def has_unix_line_endings(file_path):
    try:
        with open(file_path, 'rb') as file:
            content = file.read()
            return b'\r\n' not in content
    except Exception as e:
        print(f"Error checking '{file_path}': {e}")
        return False

def convert_to_linux_format(file_path):
    try:
        with open(file_path, 'rb') as file:
            content = file.read().replace(b'\r\n', b'\n')
        with open(file_path, 'wb') as file:
            file.write(content)
        print(f"'{file_path}' converted to Linux line endings (LF).")
    except Exception as e:
        print(f"Error processing '{file_path}': {e}")

for file in files:
    if os.path.exists(file):
        if has_unix_line_endings(file):
            print(f"'{file}' already has Unix line endings (LF). No conversion needed.")
        else:
            convert_to_linux_format(file)
    else:
        print(f"File '{file}' does not exist.")
tools/lcpp.patch
DELETED
@@ -1,499 +0,0 @@
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index de3c706f..0267c1fa 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -223,7 +223,7 @@
 #define GGML_MAX_OP_PARAMS 64
 
 #ifndef GGML_MAX_NAME
-# define GGML_MAX_NAME 64
+# define GGML_MAX_NAME 128
 #endif
 
 #define GGML_DEFAULT_N_THREADS 4
@@ -2449,6 +2449,7 @@ extern "C" {
 
     // manage tensor info
     GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
+    GGML_API void gguf_set_tensor_ndim(struct gguf_context * ctx, const char * name, int n_dim);
     GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
     GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
 
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index b16c462f..6d1568f1 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -22960,6 +22960,14 @@ void gguf_add_tensor(
     ctx->header.n_tensors++;
 }
 
+void gguf_set_tensor_ndim(struct gguf_context * ctx, const char * name, const int n_dim) {
+    const int idx = gguf_find_tensor(ctx, name);
+    if (idx < 0) {
+        GGML_ABORT("tensor not found");
+    }
+    ctx->infos[idx].n_dims = n_dim;
+}
+
 void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
     const int idx = gguf_find_tensor(ctx, name);
     if (idx < 0) {
diff --git a/src/llama.cpp b/src/llama.cpp
index 24e1f1f0..8a1e9ef8 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -205,6 +205,18 @@ enum llm_arch {
     LLM_ARCH_GRANITE,
     LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_CHAMELEON,
+    LLM_ARCH_FLUX,
+    LLM_ARCH_SD1,
+    LLM_ARCH_SDXL,
+    LLM_ARCH_SD3,
+    LLM_ARCH_AURA,
+    LLM_ARCH_LTXV,
+    LLM_ARCH_HYVID,
+    LLM_ARCH_WAN,
+    LLM_ARCH_HIDREAM,
+    LLM_ARCH_COSMOS,
+    LLM_ARCH_LUMINA2,
+    LLM_ARCH_QWEN_IMAGE,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -258,6 +270,18 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_GRANITE, "granite" },
     { LLM_ARCH_GRANITE_MOE, "granitemoe" },
     { LLM_ARCH_CHAMELEON, "chameleon" },
+    { LLM_ARCH_FLUX, "flux" },
+    { LLM_ARCH_SD1, "sd1" },
+    { LLM_ARCH_SDXL, "sdxl" },
+    { LLM_ARCH_SD3, "sd3" },
+    { LLM_ARCH_AURA, "aura" },
+    { LLM_ARCH_LTXV, "ltxv" },
+    { LLM_ARCH_HYVID, "hyvid" },
+    { LLM_ARCH_WAN, "wan" },
+    { LLM_ARCH_HIDREAM, "hidream" },
+    { LLM_ARCH_COSMOS, "cosmos" },
+    { LLM_ARCH_LUMINA2, "lumina2" },
+    { LLM_ARCH_QWEN_IMAGE, "qwen_image" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -1531,6 +1555,18 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
         { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
         },
     },
+    { LLM_ARCH_FLUX, {}},
+    { LLM_ARCH_SD1, {}},
+    { LLM_ARCH_SDXL, {}},
+    { LLM_ARCH_SD3, {}},
+    { LLM_ARCH_AURA, {}},
+    { LLM_ARCH_LTXV, {}},
+    { LLM_ARCH_HYVID, {}},
+    { LLM_ARCH_WAN, {}},
+    { LLM_ARCH_HIDREAM, {}},
+    { LLM_ARCH_COSMOS, {}},
+    { LLM_ARCH_LUMINA2, {}},
+    { LLM_ARCH_QWEN_IMAGE, {}},
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -5403,6 +5439,26 @@ static void llm_load_hparams(
     // get general kv
     ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
 
+    // Disable LLM metadata for image models
+    switch (model.arch) {
+        case LLM_ARCH_FLUX:
+        case LLM_ARCH_SD1:
+        case LLM_ARCH_SDXL:
+        case LLM_ARCH_SD3:
+        case LLM_ARCH_AURA:
+        case LLM_ARCH_LTXV:
+        case LLM_ARCH_HYVID:
+        case LLM_ARCH_WAN:
+        case LLM_ARCH_HIDREAM:
+        case LLM_ARCH_COSMOS:
+        case LLM_ARCH_LUMINA2:
+        case LLM_ARCH_QWEN_IMAGE:
+            model.ftype = ml.ftype;
+            return;
+        default:
+            break;
+    }
+
     // get hparams kv
     ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
 
@@ -18016,6 +18072,158 @@ static void llama_tensor_dequantize_internal(
     workers.clear();
 }
 
+static ggml_type img_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
+    // Special function for quantizing image model tensors
+    const std::string name = ggml_get_name(tensor);
+    const llm_arch arch = qs.model.arch;
+
+    // Sanity check
+    if (
+        (name.find("model.diffusion_model.") != std::string::npos) ||
+        (name.find("first_stage_model.") != std::string::npos) ||
+        (name.find("single_transformer_blocks.") != std::string::npos) ||
+        (name.find("joint_transformer_blocks.") != std::string::npos)
+    ) {
+        throw std::runtime_error("Invalid input GGUF file. This is not a supported UNET model");
+    }
+
+    // Unsupported quant types - exclude all IQ quants for now
+    if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_Q4_0_4_4 ||
+        ftype == LLAMA_FTYPE_MOSTLY_Q4_0_4_8 || ftype == LLAMA_FTYPE_MOSTLY_Q4_0_8_8) {
+        throw std::runtime_error("Invalid quantization type for image model (Not supported)");
+    }
+
+    if ( // Rules for to_v attention
+        (name.find("attn_v.weight") != std::string::npos) ||
+        (name.find(".to_v.weight") != std::string::npos) ||
+        (name.find(".v.weight") != std::string::npos) ||
+        (name.find(".attn.w1v.weight") != std::string::npos) ||
+        (name.find(".attn.w2v.weight") != std::string::npos) ||
+        (name.find(".add_v_proj.weight") != std::string::npos) ||
+        (name.find("_attn.v_proj.weight") != std::string::npos)
+    ){
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
+            new_type = GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
+            new_type = GGML_TYPE_Q6_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        ++qs.i_attention_wv;
+    } else if ( // Rules for fused qkv attention
+        (name.find("attn_qkv.weight") != std::string::npos) ||
+        (name.find("attn.qkv.weight") != std::string::npos) ||
+        (name.find("attention.qkv.weight") != std::string::npos)
+    ) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
+            new_type = GGML_TYPE_Q6_K;
+        }
+    } else if ( // Rules for ffn
+        (name.find("ffn_down") != std::string::npos) ||
+        ((name.find("experts.") != std::string::npos) && (name.find(".w2.weight") != std::string::npos)) ||
+        (name.find(".ffn.2.weight") != std::string::npos) || // is this even the right way around?
+        (name.find(".ff.net.2.weight") != std::string::npos) ||
+        (name.find(".mlp.layer2.weight") != std::string::npos) ||
+        (name.find(".adaln_modulation_mlp.2.weight") != std::string::npos) ||
+        (name.find(".feed_forward.w2.weight") != std::string::npos) ||
+        (name.find(".img_mlp.net.2.weight") != std::string::npos) ||
+        (name.find(".txt_mlp.net.2.weight") != std::string::npos)
+    ) {
+        // TODO: add back `layer_info` with some model specific logic + logic further down
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+            new_type = GGML_TYPE_Q6_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
+            new_type = GGML_TYPE_Q6_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_0) {
+            new_type = GGML_TYPE_Q4_1;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_0) {
+            new_type = GGML_TYPE_Q5_1;
+        }
+        ++qs.i_ffn_down;
+    }
+
+    // first/last block high precision test
+    if (arch == LLM_ARCH_QWEN_IMAGE){
+        if (
+            (name.find("transformer_blocks.0.") != std::string::npos) ||
+            (name.find("transformer_blocks.59.") != std::string::npos) // this should be dynamic
+        ) {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
+                new_type = GGML_TYPE_Q4_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+                new_type = GGML_TYPE_Q4_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+                new_type = GGML_TYPE_Q5_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
+                new_type = GGML_TYPE_Q6_K;
+            }
+        }
+    }
+
+    // Sanity check for row shape
+    bool convert_incompatible_tensor = false;
+    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
+        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
+        int nx = tensor->ne[0];
+        int ny = tensor->ne[1];
+        if (nx % QK_K != 0) {
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
+            convert_incompatible_tensor = true;
+        } else {
+            ++qs.n_k_quantized;
+        }
+    }
+    if (convert_incompatible_tensor) {
+        // TODO: Possibly reenable this in the future
+        // switch (new_type) {
+        //     case GGML_TYPE_Q2_K:
+        //     case GGML_TYPE_Q3_K:
+        //     case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
+        //     case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
+        //     case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
+        //     default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
+        // }
+        new_type = GGML_TYPE_F16;
+        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+        ++qs.n_fallback;
+    }
+    return new_type;
+}
+
 static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
     const std::string name = ggml_get_name(tensor);
 
@@ -18513,7 +18721,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     if (llama_model_has_encoder(&model)) {
         n_attn_layer *= 3;
     }
-    GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+    if (model.arch != LLM_ARCH_HYVID) { // TODO: Check why this fails
+        GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+    }
 }
 
 size_t total_size_org = 0;
@@ -18547,6 +18757,57 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             ctx_outs[i_split] = gguf_init_empty();
         }
         gguf_add_tensor(ctx_outs[i_split], tensor);
+        // SD3 pos_embed needs special fix as first dim is 1, which gets truncated here
+        if (model.arch == LLM_ARCH_SD3) {
+            const std::string name = ggml_get_name(tensor);
+            if (name == "pos_embed" && tensor->ne[2] == 1) {
+                const int n_dim = 3;
+                gguf_set_tensor_ndim(ctx_outs[i_split], "pos_embed", n_dim);
+                LLAMA_LOG_INFO("\n%s: Correcting pos_embed shape for SD3: [key:%s]\n", __func__, tensor->name);
+            }
+        }
+        // same goes for auraflow
+        if (model.arch == LLM_ARCH_AURA) {
+            const std::string name = ggml_get_name(tensor);
+            if (name == "positional_encoding" && tensor->ne[2] == 1) {
+                const int n_dim = 3;
+                gguf_set_tensor_ndim(ctx_outs[i_split], "positional_encoding", n_dim);
+                LLAMA_LOG_INFO("\n%s: Correcting positional_encoding shape for AuraFlow: [key:%s]\n", __func__, tensor->name);
+            }
+            if (name == "register_tokens" && tensor->ne[2] == 1) {
+                const int n_dim = 3;
+                gguf_set_tensor_ndim(ctx_outs[i_split], "register_tokens", n_dim);
+                LLAMA_LOG_INFO("\n%s: Correcting register_tokens shape for AuraFlow: [key:%s]\n", __func__, tensor->name);
+            }
+        }
+        // conv3d fails due to max dims - unsure what to do here as we never even reach this check
+        if (model.arch == LLM_ARCH_HYVID) {
+            const std::string name = ggml_get_name(tensor);
+            if (name == "img_in.proj.weight" && tensor->ne[5] != 1 ) {
+                throw std::runtime_error("img_in.proj.weight size failed for HyVid");
+            }
+        }
+        // All the modulation layers also have dim1, and I think conv3d fails here too but we segfault way before that...
+        if (model.arch == LLM_ARCH_WAN) {
+            const std::string name = ggml_get_name(tensor);
+            if (name.find(".modulation") != std::string::npos && tensor->ne[2] == 1) {
+                const int n_dim = 3;
+                gguf_set_tensor_ndim(ctx_outs[i_split], tensor->name, n_dim);
+                LLAMA_LOG_INFO("\n%s: Correcting shape for Wan: [key:%s]\n", __func__, tensor->name);
+            }
+            // FLF2V model only
+            if (name == "img_emb.emb_pos") {
+                const int n_dim = 3;
+                gguf_set_tensor_ndim(ctx_outs[i_split], tensor->name, n_dim);
+                LLAMA_LOG_INFO("\n%s: Correcting shape for Wan FLF2V: [key:%s]\n", __func__, tensor->name);
+            }
+            // S2V model only
+            if (name == "casual_audio_encoder.weights" || name == "casual_audio_encoder.encoder.padding_tokens") {
+                const int n_dim = 4;
+                gguf_set_tensor_ndim(ctx_outs[i_split], tensor->name, n_dim);
+                LLAMA_LOG_INFO("\n%s: Correcting shape for Wan S2V: [key:%s]\n", __func__, tensor->name);
+            }
+        }
     }
 
     // Set split info if needed
@@ -18647,6 +18908,124 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // do not quantize relative position bias (T5)
     quantize &= name.find("attn_rel_b.weight") == std::string::npos;
 
+    // rules for image models
+    bool image_model = false;
+    if (model.arch == LLM_ARCH_FLUX) {
+        image_model = true;
+        quantize &= name.find("txt_in.") == std::string::npos;
+        quantize &= name.find("img_in.") == std::string::npos;
+        quantize &= name.find("time_in.") == std::string::npos;
+        quantize &= name.find("vector_in.") == std::string::npos;
+        quantize &= name.find("guidance_in.") == std::string::npos;
+        quantize &= name.find("final_layer.") == std::string::npos;
+    }
+    if (model.arch == LLM_ARCH_SD1 || model.arch == LLM_ARCH_SDXL) {
+        image_model = true;
+        quantize &= name.find("class_embedding.") == std::string::npos;
+        quantize &= name.find("time_embedding.") == std::string::npos;
+        quantize &= name.find("add_embedding.") == std::string::npos;
+        quantize &= name.find("time_embed.") == std::string::npos;
+        quantize &= name.find("label_emb.") == std::string::npos;
+        quantize &= name.find("conv_in.") == std::string::npos;
+        quantize &= name.find("conv_out.") == std::string::npos;
+        quantize &= name != "input_blocks.0.0.weight";
+        quantize &= name != "out.2.weight";
+    }
+    if (model.arch == LLM_ARCH_SD3) {
+        image_model = true;
+        quantize &= name.find("final_layer.") == std::string::npos;
+        quantize &= name.find("time_text_embed.") == std::string::npos;
+        quantize &= name.find("context_embedder.") == std::string::npos;
+        quantize &= name.find("t_embedder.") == std::string::npos;
+        quantize &= name.find("y_embedder.") == std::string::npos;
+        quantize &= name.find("x_embedder.") == std::string::npos;
+        quantize &= name != "proj_out.weight";
+        quantize &= name != "pos_embed";
+    }
+    if (model.arch == LLM_ARCH_AURA) {
+        image_model = true;
+        quantize &= name.find("t_embedder.") == std::string::npos;
+        quantize &= name.find("init_x_linear.") == std::string::npos;
+        quantize &= name != "modF.1.weight";
+        quantize &= name != "cond_seq_linear.weight";
+        quantize &= name != "final_linear.weight";
+        quantize &= name != "final_linear.weight";
+        quantize &= name != "positional_encoding";
+        quantize &= name != "register_tokens";
+    }
+    if (model.arch == LLM_ARCH_LTXV) {
+        image_model = true;
+        quantize &= name.find("adaln_single.") == std::string::npos;
+        quantize &= name.find("caption_projection.") == std::string::npos;
+        quantize &= name.find("patchify_proj.") == std::string::npos;
+        quantize &= name.find("proj_out.") == std::string::npos;
+        quantize &= name.find("scale_shift_table") == std::string::npos; // last block too
+    }
+    if (model.arch == LLM_ARCH_HYVID) {
+        image_model = true;
+        quantize &= name.find("txt_in.") == std::string::npos;
+        quantize &= name.find("img_in.") == std::string::npos;
+        quantize &= name.find("time_in.") == std::string::npos;
+        quantize &= name.find("vector_in.") == std::string::npos;
+        quantize &= name.find("guidance_in.") == std::string::npos;
+        quantize &= name.find("final_layer.") == std::string::npos;
+    }
+    if (model.arch == LLM_ARCH_WAN) {
+        image_model = true;
+        quantize &= name.find("modulation.") == std::string::npos;
+        quantize &= name.find("patch_embedding.") == std::string::npos;
+        quantize &= name.find("text_embedding.") == std::string::npos;
+        quantize &= name.find("time_projection.") == std::string::npos;
+        quantize &= name.find("time_embedding.") == std::string::npos;
+        quantize &= name.find("img_emb.") == std::string::npos;
+        quantize &= name.find("head.") == std::string::npos;
+        // S2V
+        quantize &= name.find("cond_encoder.") == std::string::npos;
+        quantize &= name.find("frame_packer.") == std::string::npos;
+        quantize &= name.find("audio_injector.") == std::string::npos;
+        quantize &= name.find("casual_audio_encoder.") == std::string::npos;
+        quantize &= name.find("trainable_cond_mask.") == std::string::npos;
+    }
+    if (model.arch == LLM_ARCH_HIDREAM) {
+        image_model = true;
+        quantize &= name.find("p_embedder.") == std::string::npos;
+        quantize &= name.find("t_embedder.") == std::string::npos;
+        quantize &= name.find("x_embedder.") == std::string::npos;
+        quantize &= name.find("final_layer.") == std::string::npos;
+        quantize &= name.find(".ff_i.gate.weight") == std::string::npos;
+        quantize &= name.find("caption_projection.") == std::string::npos;
+    }
+    if (model.arch == LLM_ARCH_COSMOS) {
+        image_model = true;
+        quantize &= name.find("p_embedder.") == std::string::npos;
+        quantize &= name.find("t_embedder.") == std::string::npos;
+        quantize &= name.find("t_embedding_norm.") == std::string::npos;
+        quantize &= name.find("x_embedder.") == std::string::npos;
+        quantize &= name.find("pos_embedder.") == std::string::npos;
+        quantize &= name.find("final_layer.") == std::string::npos;
+    }
+    if (model.arch == LLM_ARCH_LUMINA2) {
+        image_model = true;
+        quantize &= name.find("t_embedder.") == std::string::npos;
+        quantize &= name.find("x_embedder.") == std::string::npos;
+        quantize &= name.find("final_layer.") == std::string::npos;
+        quantize &= name.find("cap_embedder.") == std::string::npos;
+        quantize &= name.find("context_refiner.") == std::string::npos;
+        quantize &= name.find("noise_refiner.") == std::string::npos;
+    }
+    if (model.arch == LLM_ARCH_QWEN_IMAGE) {
+        image_model = true;
+        quantize &= name.find("img_in.") == std::string::npos;
+        quantize &= name.find("txt_in.") == std::string::npos;
+        quantize &= name.find("time_text_embed.") == std::string::npos;
+        quantize &= name.find("proj_out.") == std::string::npos;
+        quantize &= name.find("norm_out.") == std::string::npos;
+    }
+    // ignore 3D/4D tensors for image models as the code was never meant to handle these
+    if (image_model) {
+        quantize &= ggml_n_dims(tensor) == 2;
+    }
+
     enum ggml_type new_type;
     void * new_data;
     size_t new_size;
@@ -18655,6 +19034,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     new_type = default_type;
 
     // get more optimal quantization type based on the tensor shape, layer, etc.
+    if (image_model) {
+        new_type = img_tensor_get_type(qs, new_type, tensor, ftype);
+    } else {
     if (!params->pure && ggml_is_quantized(default_type)) {
        new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
    }
@@ -18664,6 +19046,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
    if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
        new_type = params->output_tensor_type;
    }
+    }
 
    // If we've decided to quantize to the same type the tensor is already
    // in then there's nothing to do.
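
For context, the workflow this patch supported (per terraform_dict in tool_auto.py below) was to check out llama.cpp at the pinned tag b3962, apply the patch, and build the quantize tool. A minimal sketch in Python; the build commands and paths are assumptions, not taken from this repo:

import subprocess

def sh(*args):
    # thin stand-in for run_cmd() from tool_auto.py below
    subprocess.run(args, check=True)

sh("git", "clone", "https://github.com/ggerganov/llama.cpp", "llama.cpp.auto")
sh("git", "-C", "llama.cpp.auto", "checkout", "tags/b3962")
sh("git", "-C", "llama.cpp.auto", "apply", "/path/to/lcpp.patch")  # hypothetical path
# assumed cmake invocation for the llama-quantize binary:
sh("cmake", "-B", "llama.cpp.auto/build", "-S", "llama.cpp.auto")
sh("cmake", "--build", "llama.cpp.auto/build", "--target", "llama-quantize", "--config", "Release")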
tools/read_tensors.py
DELETED
@@ -1,21 +0,0 @@
#!/usr/bin/python3
import os
import sys
import gguf

def read_tensors(path):
    reader = gguf.GGUFReader(path)
    for tensor in reader.tensors:
        if tensor.tensor_type == gguf.GGMLQuantizationType.F32:
            continue
        print(f"{str(tensor.tensor_type):32}: {tensor.name}")

try:
    path = sys.argv[1]
    assert os.path.isfile(path), "Invalid path"
    print(f"input: {path}")
except Exception as e:
    input(f"failed: {e}")
else:
    read_tensors(path)
    input()
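
A quick usage note: the script takes a GGUF path as its only argument and lists every tensor that is not stored as F32; the trailing input() calls suggest it was meant for drag-and-drop use on Windows. An equivalent programmatic sketch (the file name is hypothetical):

import gguf

reader = gguf.GGUFReader("model-Q4_K_S.gguf")
for tensor in reader.tensors:
    if tensor.tensor_type != gguf.GGMLQuantizationType.F32:
        print(tensor.tensor_type.name, tensor.name)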
tools/tool_auto.py
DELETED
@@ -1,374 +0,0 @@
-# (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0)
-import os
-import re
-import sys
-import time
-import torch
-import logging
-import argparse
-import subprocess
-import huggingface_hub as hf
-
-logging.getLogger().setLevel(logging.DEBUG)
-
-qtypes = [
-    # "F16", "BF16",
-    "Q8_0", "Q6_K",
-    "Q5_K_M", "Q5_K_S", "Q5_1", "Q5_0",
-    "Q4_K_M", "Q4_K_S", "Q4_1", "Q4_0",
-    "Q3_K_M", "Q3_K_S", "Q2_K"
-]
-
-dtype_dict = {
-    "F32": torch.float32,
-    "F16": torch.float16,
-    "BF16": torch.bfloat16,
-    "F8_E4M3": getattr(torch, "float8_e4m3fn", "_invalid"),
-    "F8_E5M2": getattr(torch, "float8_e5m2", "_invalid"),
-}
-
-# this is pretty jank but I want to be able to run it on a blank instance w/o setup
-terraform_dict = {
-    "repo": "city96/ComfyUI-GGUF",
-    "target": "auto_convert",
-    "lcpp_repo": "ggerganov/llama.cpp",
-    "lcpp_target": "tags/b3962",
-}
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--src", required=True, help="Source model file or huggingface repo name")
-    parser.add_argument("--quants", nargs="+", choices=["all", "base", *qtypes], default=["Q8_0"])
-    parser.add_argument("--output-dir", default=None, help="Location for output files, defaults to current dir or ComfyUI model dir.")
-    parser.add_argument("--temp-dir", default=None, help="Location for temp files, defaults to [output_dir]/tmp")
-    parser.add_argument("--force-update", action="store_true", help="Force update & rebuild entire quantization stack.")
-    parser.add_argument("--resume", action="store_true", help="Skip over existing files. Will NOT check for broken/interrupted files.")
-
-    args = parser.parse_args()
-    if args.output_dir is None:
-        args.output_dir = get_output_dir()
-    if args.temp_dir is None:
-        args.temp_dir = os.path.join(args.output_dir, "tmp")
-
-    if os.path.isdir(args.temp_dir) and len(os.listdir(args.temp_dir)) > 0:
-        raise OSError("Output temp folder not empty!")
-
-    if "all" in args.quants:
-        args.quants = ["base", *qtypes]
-
-    return args
-
-def run_cmd(*args, log_error=False):
-    logging.debug(f"cmd: {args}")
-    try:
-        log = subprocess.run(args, capture_output=True, text=True)
-    except Exception as e:
-        logging.warning(f"{args[0]}, {e}")
-        return -1
-    if log.returncode != 0 and log_error:
-        logging.warning(f"{args[0]}: {log.stdout} {log.stderr}")
-    else:
-        logging.debug(f"{args[0]}: {repr(log.stdout)} {repr(log.stderr.strip())} RET:{log.returncode}")
-    return log.returncode
-
-def setup_utils(force_update=False):
-    # get ComfyUI-GGUF if missing, then compile patched llama.cpp if required
-    root = os.path.dirname(os.path.abspath(__file__))
-    root = os.path.normpath(root)
-
-    if os.path.split(root)[1] != "tools":
-        cg_dir = os.path.join(root, "ComfyUI-GGUF")
-        if not os.path.isdir(cg_dir):
-            logging.warning(f"Running outside tools folder! Cloning to {cg_dir}")
-            run_cmd("git", "clone", f"https://github.com/{terraform_dict['repo']}", cg_dir)
-            need_update = True
-        else:
-            need_update = False
-
-        if force_update or need_update:
-            if terraform_dict['target']:
-                logging.info(f"Attempting to check out ComfyUI-GGUF branch {terraform_dict['target']}")
-                run_cmd("git", "-C", cg_dir, "checkout", terraform_dict['target'])
-
-            logging.info("Attempting to git pull ComfyUI-GGUF to latest")
-            run_cmd("git", "-C", cg_dir, "pull")
-
-        tools_dir = os.path.join(root, "ComfyUI-GGUF", "tools")
-        sys.path.append(tools_dir) # to make import(s) work
-    else:
-        # TODO: Git pull here too?
-        logging.warning(f"Assuming latest ComfyUI-GGUF. Please git pull & check out branch {terraform_dict['target']} manually!")
-        tools_dir = root
-
-    if not os.path.isdir(tools_dir):
-        raise OSError(f"Can't find tools subfolder in ComfyUI-GGUF at {tools_dir}")
-
-    convert_path = os.path.join(tools_dir, "convert.py")
-    if not os.path.isfile(convert_path):
-        raise OSError(f"Cannot find convert.py at location: {convert_path}")
-
-    lcpp_path = os.path.join(root, "llama.cpp.auto") # avoid messing with regular dir
-    if not os.path.isdir(lcpp_path):
-        logging.info(f"Attempting to clone llama.cpp repo to {lcpp_path}")
-        run_cmd("git", "clone", f"https://github.com/{terraform_dict['lcpp_repo']}", lcpp_path)
-        need_update = True
-    else:
-        need_update = False
-
-    if force_update or need_update:
-        # TODO: check reflog and/or git reset before checkout?
-        logging.info(f"Attempting to check out llama.cpp target {terraform_dict['lcpp_target']}")
-        run_cmd("git", "-C", lcpp_path, "checkout", terraform_dict['lcpp_target'])
-
-    # TODO: git reset before patch?
-    patch_path = os.path.join(tools_dir, "lcpp.patch")
-    # patch (probably) has wrong file endings:
-    logging.info("Converting patch file endings")
-    with open(patch_path, "rb") as file:
-        content = file.read().replace(b"\r\n", b"\n")
-    with open(patch_path, "wb") as file:
-        file.write(content)
-
-    if run_cmd("git", "-C", lcpp_path, "apply", "--check", "-R", patch_path) != 0:
-        logging.info("Attempting to apply patch to llama.cpp repo")
-        run_cmd("git", "-C", lcpp_path, "apply", patch_path)
-    else:
-        logging.info("Patch already applied")
-
-    # using cmake here as llama.cpp switched to it completely for new versions
-    if os.name == "nt":
-        bin_path = os.path.join(lcpp_path, "build", "bin", "debug", "llama-quantize.exe")
-    else:
-        bin_path = os.path.join(lcpp_path, "build", "bin", "llama-quantize")
-
-    if not os.path.isfile(bin_path) or force_update or need_update:
-        if run_cmd("cmake", "--version") != 0:
-            raise RuntimeError("Can't find cmake! Make sure you have a working build environment set up")
-
-        build_path = os.path.join(lcpp_path, "build")
-        os.makedirs(build_path, exist_ok=True)
-        logging.info("Attempting to build llama.cpp binary from source")
-        run_cmd("cmake", "-B", build_path, lcpp_path)
-        run_cmd("cmake", "--build", build_path, "--config", "Debug", "-j4", "--target", "llama-quantize")
-        if not os.path.isfile(bin_path):
-            raise RuntimeError("Build failed! Rerun with --debug to see error log.")
-    else:
-        logging.info("Binary already present")
-
-    return bin_path
-
-def get_output_dir():
-    root = os.path.dirname(os.path.abspath(__file__))
-    root = os.path.normpath(root)
-    split = os.path.split(root)
-    while split[1]:
-        if split[1] == "ComfyUI":
-            if os.path.isdir(os.path.join(*split, "models", "unet")): # new
-                root = os.path.join(*split, "models", "unet", "gguf")
-                logging.info(f"Found ComfyUI, using model folder: {root}")
-                return root
-
-            if os.path.isdir(os.path.join(*split, "models", "diffusion_models")): # old
-                root = os.path.join(*split, "models", "diffusion_models", "gguf")
-                logging.info(f"Found ComfyUI, using model folder: {root}")
-                return root
-
-            logging.info("Found ComfyUI, but can't find model folder")
-            break
-
-        split = os.path.split(split[0])
-
-    root = os.path.join(root, "models")
-    logging.info(f"Defaulting to [script dir]/models: {root}")
-    return root
-
-def get_hf_fake_sd(repo, path, device=torch.device("meta")):
-    sd = {}
-    meta = hf.parse_safetensors_file_metadata(repo, path)
-    for key, raw in meta.tensors.items():
-        shape = tuple(raw.shape)
-        dtype = dtype_dict.get(raw.dtype, torch.float32)
-        sd[key] = torch.zeros(shape, dtype=dtype, device=device)
-    return sd
-
-def get_hf_file_arch(repo, path):
-    pattern = r'(\d+)-of-(\d+)'
-    match = re.search(pattern, path)
-
-    if match:
-        # we need to load it as multipart
-        if int(match.group(1)) != 1:
-            return None
-        sd = {}
-        for k in range(int(match.group(2))):
-            shard_path = path.replace(match.group(1), f"{k+1:0{len(match.group(1))}}")
-            sd.update(get_hf_fake_sd(repo, shard_path))
-    else:
-        sd = get_hf_fake_sd(repo, path)
-
-    # this should raise an error on failure
-    sd = strip_prefix(sd)
-    model_arch = detect_arch(sd)
-
-    # this is for SDXL and SD1.5, I want to overhaul this logic to match sd.cpp eventually
-    assert not model_arch.shape_fix, "Model uses shape fix (SDXL/SD1) - unsupported for now."
-    return model_arch.arch
-
-def get_hf_valid_files(repo):
-    # TODO: probably tweak this?
-    MIN_SIZE_GB = 1
-    VALID_SRC_EXTS = [".safetensors", ] # ".pt", ".ckpt", ]
-    meta = hf.model_info(repo, files_metadata=True)
-
-    valid = {}
-    for file in meta.siblings:
-        path = file.rfilename
-        fname = os.path.basename(path)
-        name, ext = os.path.splitext(fname)
-
-        if ext.lower() not in VALID_SRC_EXTS:
-            logging.debug(f"Invalid ext: {path} {ext}")
-            continue
-
-        if file.size / (1024 ** 3) < MIN_SIZE_GB:
-            logging.debug(f"File too small: {path} {file.size}")
-            continue
-
-        try:
-            arch = get_hf_file_arch(repo, path)
-        except Exception as e:
-            logging.warning(f"Arch detect fail: {e} ({path})")
-        else:
-            if arch is not None:
-                valid[path] = arch
-                logging.info(f"Found '{arch}' model at path {path}")
-    return valid
-
-def make_base_quant(src, output_dir, temp_dir, final=True, resume=True):
-    name, ext = os.path.splitext(os.path.basename(src))
-    if ext == ".gguf":
-        logging.info("Input file already in gguf, assuming base quant")
-        return None, src, None
-
-    name = name.lower() # comment out to preserve case in all quants
-    dst_tmp = os.path.join(temp_dir, f"{name}-{{ftype}}.gguf") # ftype is filled in by convert.py
-
-    tmp_path, model_arch, fix_path = convert_file(src, dst_tmp, interact=False, overwrite=False)
-    dst_path = os.path.join(output_dir, os.path.basename(tmp_path))
-    if os.path.isfile(dst_path):
-        if resume:
-            logging.warning("Resuming with interrupted base quant, may be incorrect!")
-            return dst_path, tmp_path, fix_path
-        raise OSError(f"Output already exists! Clear folder? {dst_path}")
-
-    if fix_path is not None and os.path.isfile(fix_path):
-        quant_source = tmp_path
-        if final:
-            apply_5d_fix(tmp_path, dst_path, fix=fix_path, overwrite=False)
-        else:
-            dst_path = None
-    else:
-        fix_path = None
-        if final:
-            os.rename(tmp_path, dst_path)
-            quant_source = dst_path
-        else:
-            dst_path = None
-            quant_source = tmp_path
-
-    return dst_path, quant_source, fix_path
-
-def make_quant(src, output_dir, temp_dir, qtype, quantize_binary, fix_path=None, resume=True):
-    name, ext = os.path.splitext(os.path.basename(src))
-    assert ext.lower() == ".gguf", "Invalid input file"
-
-    src_qtext = [x for x in ["-F32.gguf", "-F16.gguf", "-BF16.gguf"] if x in src]
-    if len(src_qtext) == 1:
-        tmp_path = os.path.join(
-            temp_dir,
-            os.path.basename(src).replace(src_qtext[0], f"-{qtype.upper()}.gguf")
-        )
-    else:
-        tmp_path = os.path.join(
-            temp_dir,
-            f"{name}-{qtype.upper()}.gguf"
-        )
-    tmp_path = os.path.abspath(tmp_path)
-    dst_path = os.path.join(output_dir, os.path.basename(tmp_path))
-    if os.path.isfile(dst_path):
-        if resume:
-            return dst_path
-        raise OSError("Output already exists! Clear folder?")
-
-    r = run_cmd(quantize_binary, src, tmp_path, qtype, log_error=True)
-    time.sleep(2) # leave time for file sync?
-    if r != 0:
-        raise RuntimeError(f"Quantization failed with error code {r}")
-
-    if fix_path is not None:
-        apply_5d_fix(tmp_path, dst_path, fix=fix_path, overwrite=False)
-        if os.path.isfile(dst_path) and os.path.isfile(tmp_path):
-            os.remove(tmp_path)
-    else:
-        os.rename(tmp_path, dst_path)
-
-    return dst_path
-
-if __name__ == "__main__":
-    args = get_args()
-    os.makedirs(args.output_dir, exist_ok=True)
-    os.makedirs(args.temp_dir, exist_ok=True)
-    quantize_binary = setup_utils(args.force_update)
-
-    try:
-        from convert import detect_arch, strip_prefix, convert_file
-        from fix_5d_tensors import apply_5d_fix
-    except (ImportError, ModuleNotFoundError) as e:
-        raise ImportError(f"Can't import required utils: {e}")
-
-    if not os.path.isfile(args.src):
-        # huggingface repo. TODO: file choice
-        if len(args.src.split("/")) != 2:
-            raise OSError(f"Invalid huggingface repo or model path {args.src}")
-        raise NotImplementedError("HF not yet supported")
-        # download then set to temp file
-        # hf_repo = "Lightricks/LTX-Video" # "fal/AuraFlow-v0.3"
-        # get_hf_valid_files(hf_repo)
-        # args.src = ...
-
-    out_files = []
-
-    base_quant, quant_source, fix_path = make_base_quant(
-        args.src,
-        args.output_dir,
-        args.temp_dir,
-        final=("base" in args.quants),
-        resume=args.resume,
-    )
-    if "base" in args.quants:
-        args.quants = [x for x in args.quants if x not in ["base"]]
-        if base_quant is not None:
-            out_files.append(base_quant)
-
-    for qtype in args.quants:
-        out_files.append(make_quant(
-            quant_source,
-            args.output_dir,
-            args.temp_dir,
-            qtype,
-            quantize_binary,
-            fix_path,
-            resume=args.resume,
-        ))
-
-    if fix_path is not None and os.path.isfile(fix_path):
-        os.remove(fix_path)
-
-    if base_quant != quant_source:
-        # make sure our quant source is in the temp folder before removing it
-        cc = os.path.commonpath([os.path.normpath(quant_source), os.path.normpath(args.temp_dir)])
-        if cc == os.path.normpath(args.temp_dir):
-            os.remove(quant_source)
-
-    out_file_str = '\n'.join(out_files)
-    logging.info(f"Output file(s): {out_file_str}")
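
Two notes on the deleted tool_auto.py. Its CLI surface is defined entirely by get_args() above, so a typical invocation would have looked like `python tool_auto.py --src model.safetensors --quants Q8_0 Q5_K_M --resume` (the filename here is a placeholder). Separately, the zero-padded shard enumeration inside get_hf_file_arch() is easy to misread; a self-contained sketch of just that naming logic, with a placeholder filename:

import re

# Given the first shard's filename, enumerate every shard in the set
# while keeping the original zero padding, as get_hf_file_arch() does.
path = "model-00001-of-00003.safetensors"  # placeholder filename
match = re.search(r"(\d+)-of-(\d+)", path)
assert match is not None, "not a multipart file"
width = len(match.group(1))  # pad new indices to the same width
for k in range(int(match.group(2))):
    # swap the first shard index for the k-th, keeping zero padding
    print(path.replace(match.group(1), f"{k + 1:0{width}}"))
# -> model-00001-of-00003.safetensors
#    model-00002-of-00003.safetensors
#    model-00003-of-00003.safetensors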