Alissonerdx committed
Commit 8943c32 · verified · 1 parent: df39cb8

Delete tools

tools/convert.py DELETED
@@ -1,412 +0,0 @@
# (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0)
import os
import gguf
import torch
import logging
import argparse
from tqdm import tqdm
from safetensors.torch import load_file, save_file

QUANTIZATION_THRESHOLD = 1024
REARRANGE_THRESHOLD = 512
MAX_TENSOR_NAME_LENGTH = 127
MAX_TENSOR_DIMS = 4

class ModelTemplate:
    arch = "invalid"  # string describing architecture
    shape_fix = False # whether to reshape tensors
    ndims_fix = False # whether to save fix file for tensors exceeding max dims
    keys_detect = []  # list of lists to match in state dict
    keys_banned = []  # list of keys that should mark model as invalid for conversion
    keys_hiprec = []  # list of keys that need to be kept in fp32 for some reason
    keys_ignore = []  # list of strings to ignore keys by when found

class ModelFlux(ModelTemplate):
    arch = "flux"
    keys_detect = [
        ("single_transformer_blocks.0.attn.norm_k.weight",),
        ("double_blocks.0.img_attn.proj.weight",),
    ]
    keys_banned = ["single_transformer_blocks.0.attn.norm_k.weight",]

class ModelSD3(ModelTemplate):
    arch = "sd3"
    keys_detect = [
        ("transformer_blocks.0.ff_context.net.0.proj.weight",),
        ("joint_blocks.0.x_block.attn.qkv.weight",),
    ]
    keys_banned = ["transformer_blocks.0.ff_context.net.0.proj.weight",]

class ModelAura(ModelTemplate):
    arch = "aura"
    keys_detect = [
        ("double_layers.3.modX.1.weight",),
        ("joint_transformer_blocks.3.ff_context.out_projection.weight",),
    ]
    keys_banned = ["joint_transformer_blocks.3.ff_context.out_projection.weight",]

class ModelHiDream(ModelTemplate):
    arch = "hidream"
    keys_detect = [
        (
            "caption_projection.0.linear.weight",
            "double_stream_blocks.0.block.ff_i.shared_experts.w3.weight"
        )
    ]
    keys_hiprec = [
        # nn.parameter, can't load from BF16 ver
        ".ff_i.gate.weight",
        "img_emb.emb_pos"
    ]

class ModelCosmosPredict2(ModelTemplate):
    arch = "cosmos"
    keys_detect = [
        (
            "blocks.0.mlp.layer1.weight",
            "blocks.0.adaln_modulation_cross_attn.1.weight",
        )
    ]
    keys_hiprec = ["pos_embedder"]
    keys_ignore = ["_extra_state", "accum_"]

class ModelQwenImage(ModelTemplate):
    arch = "qwen_image"
    keys_detect = [
        (
            "time_text_embed.timestep_embedder.linear_2.weight",
            "transformer_blocks.0.attn.norm_added_q.weight",
            "transformer_blocks.0.img_mlp.net.0.proj.weight",
        )
    ]

class ModelHyVid(ModelTemplate):
    arch = "hyvid"
    ndims_fix = True
    keys_detect = [
        (
            "double_blocks.0.img_attn_proj.weight",
            "txt_in.individual_token_refiner.blocks.1.self_attn_qkv.weight",
        )
    ]

class ModelWan(ModelTemplate):
    arch = "wan"
    ndims_fix = True
    keys_detect = [
        (
            "blocks.0.self_attn.norm_q.weight",
            "text_embedding.2.weight",
            "head.modulation",
        )
    ]
    keys_hiprec = [
        ".modulation",                # nn.parameter, can't load from BF16 ver
        ".encoder.padding_tokens",    # nn.parameter, specific to S2V
        "trainable_cond_mask",        # used directly w/ .weight
        "casual_audio_encoder.weights",      # nn.parameter, specific to S2V
        "casual_audio_encoder.encoder.conv", # CausalConv1d doesn't use ops.py for now
    ]

class ModelLTXV(ModelTemplate):
    arch = "ltxv"
    keys_detect = [
        (
            "adaln_single.emb.timestep_embedder.linear_2.weight",
            "transformer_blocks.27.scale_shift_table",
            "caption_projection.linear_2.weight",
        )
    ]
    keys_hiprec = [
        "scale_shift_table" # nn.parameter, can't load from BF16 base quant
    ]

class ModelSDXL(ModelTemplate):
    arch = "sdxl"
    shape_fix = True
    keys_detect = [
        ("down_blocks.0.downsamplers.0.conv.weight", "add_embedding.linear_1.weight",),
        (
            "input_blocks.3.0.op.weight", "input_blocks.6.0.op.weight",
            "output_blocks.2.2.conv.weight", "output_blocks.5.2.conv.weight",
        ), # Non-diffusers
        ("label_emb.0.0.weight",),
    ]

class ModelSD1(ModelTemplate):
    arch = "sd1"
    shape_fix = True
    keys_detect = [
        ("down_blocks.0.downsamplers.0.conv.weight",),
        (
            "input_blocks.3.0.op.weight", "input_blocks.6.0.op.weight", "input_blocks.9.0.op.weight",
            "output_blocks.2.1.conv.weight", "output_blocks.5.2.conv.weight", "output_blocks.8.2.conv.weight"
        ), # Non-diffusers
    ]

class ModelLumina2(ModelTemplate):
    arch = "lumina2"
    keys_detect = [
        ("cap_embedder.1.weight", "context_refiner.0.attention.qkv.weight")
    ]

class ModelHuMo(ModelTemplate):
    arch = "humo"
    ndims_fix = True
    keys_detect = [
        ("blocks.39.audio_cross_attn_wrapper.norm1_audio.weight",),
        ("audio_proj.audio_proj_glob_1.layer.weight",),
        (
            "blocks.39.audio_cross_attn_wrapper.norm1_audio.weight",
            "blocks.0.self_attn.norm_q.weight",
            "text_embedding.2.weight",
            "head.modulation"
        ),
    ]
    keys_hiprec = ["patch_embedding", "text_embedding", "time_embedding", ".modulation"]

# The architectures are checked in order and the first successful match terminates the search.
arch_list = [
    ModelFlux, ModelSD3, ModelAura, ModelHiDream, ModelCosmosPredict2, ModelQwenImage,
    ModelLTXV, ModelHyVid, ModelHuMo, ModelWan, ModelSDXL, ModelSD1, ModelLumina2
]

def is_model_arch(model, state_dict):
    # check if model is correct
    matched = False
    invalid = False
    # print(state_dict)
    for match_list in model.keys_detect:
        if all(key in state_dict for key in match_list):
            matched = True
            invalid = any(key in state_dict for key in model.keys_banned)
            break
    assert not invalid, f"Model architecture not allowed for conversion! (i.e. reference VS diffusers format) [arch:{model.arch}]"
    return matched

def detect_arch(state_dict):
    model_arch = None
    for arch in arch_list:
        if is_model_arch(arch, state_dict):
            model_arch = arch()
            break
    assert model_arch is not None, "Unknown model architecture!"
    return model_arch

def parse_args():
    parser = argparse.ArgumentParser(description="Generate F16 GGUF files from single UNET")
    parser.add_argument("--src", required=True, help="Source model ckpt file.")
    parser.add_argument("--dst", help="Output unet gguf file.")
    args = parser.parse_args()

    if not os.path.isfile(args.src):
        parser.error("No input provided!")

    return args

def strip_prefix(state_dict):
    # prefix for mixed state dict
    prefix = None
    for pfx in ["model.diffusion_model.", "model."]:
        if any([x.startswith(pfx) for x in state_dict.keys()]):
            prefix = pfx
            break

    # prefix for uniform state dict
    if prefix is None:
        for pfx in ["net."]:
            if all([x.startswith(pfx) for x in state_dict.keys()]):
                prefix = pfx
                break

    # strip prefix if found
    if prefix is not None:
        logging.info(f"State dict prefix found: '{prefix}'")
        sd = {}
        for k, v in state_dict.items():
            if prefix not in k:
                continue
            k = k.replace(prefix, "")
            sd[k] = v
    else:
        logging.debug("State dict has no prefix")
        sd = state_dict

    return sd

def find_main_dtype(state_dict, allow_fp32=False):
    # detect most common dtype in input
    dtypes = [x.dtype for x in state_dict.values()]
    dtypes = {x:dtypes.count(x) for x in set(dtypes)}
    main_dtype = max(dtypes, key=dtypes.get)

    if main_dtype == torch.bfloat16:
        ftype_name = "BF16"
        ftype_gguf = gguf.LlamaFileType.MOSTLY_BF16
    elif main_dtype == torch.float32 and allow_fp32:
        ftype_name = "F32"
        ftype_gguf = gguf.LlamaFileType.ALL_F32
    else:
        ftype_name = "F16"
        ftype_gguf = gguf.LlamaFileType.MOSTLY_F16

    return ftype_name, ftype_gguf

def load_state_dict(path):
    if any(path.endswith(x) for x in [".ckpt", ".pt", ".bin", ".pth"]):
        state_dict = torch.load(path, map_location="cpu", weights_only=True)
        for subkey in ["model", "module"]:
            if subkey in state_dict:
                state_dict = state_dict[subkey]
                break
        if len(state_dict) < 20:
            raise RuntimeError(f"pt subkey load failed: {state_dict.keys()}")
    else:
        state_dict = load_file(path)

    return strip_prefix(state_dict)

def handle_tensors(writer, state_dict, model_arch, allow_fp32=False):
    name_lengths = tuple(sorted(
        ((key, len(key)) for key in state_dict.keys()),
        key=lambda item: item[1],
        reverse=True,
    ))
    if not name_lengths:
        return
    max_name_len = name_lengths[0][1]

    if max_name_len > MAX_TENSOR_NAME_LENGTH:
        bad_list = ", ".join(f"{key!r} ({namelen})" for key, namelen in name_lengths if namelen > MAX_TENSOR_NAME_LENGTH)
        raise ValueError(f"Can only handle tensor names up to {MAX_TENSOR_NAME_LENGTH} characters. Tensors exceeding the limit: {bad_list}")

    invalid_tensors = {}
    quantized_tensors = {}
    for key, data in tqdm(state_dict.items()):
        old_dtype = data.dtype

        if any(x in key for x in model_arch.keys_ignore):
            tqdm.write(f"Filtering ignored key: '{key}'")
            continue

        if data.dtype == torch.bfloat16:
            data = data.to(torch.float32).numpy()
        # this is so we don't break torch 2.0.X
        elif data.dtype in [getattr(torch, "float8_e4m3fn", "_invalid"), getattr(torch, "float8_e5m2", "_invalid")]:
            data = data.to(torch.float16).numpy()
        else:
            data = data.numpy()

        n_dims = len(data.shape)
        data_shape = data.shape
        if old_dtype == torch.bfloat16:
            data_qtype = gguf.GGMLQuantizationType.BF16
        elif old_dtype == torch.float32 and allow_fp32:
            data_qtype = gguf.GGMLQuantizationType.F32
        else:
            data_qtype = gguf.GGMLQuantizationType.F16

        # The max no. of dimensions that can be handled by the quantization code is 4
        if len(data.shape) > MAX_TENSOR_DIMS:
            invalid_tensors[key] = data
            continue # needs to be added back later

        # get number of parameters (AKA elements) in this tensor
        n_params = 1
        for dim_size in data_shape:
            n_params *= dim_size

        if old_dtype in (torch.float32, torch.bfloat16):
            if n_dims == 1:
                # one-dimensional tensors should be kept in F32
                # also speeds up inference due to not dequantizing
                data_qtype = gguf.GGMLQuantizationType.F32

            elif n_params <= QUANTIZATION_THRESHOLD:
                # very small tensors
                data_qtype = gguf.GGMLQuantizationType.F32

            elif any(x in key for x in model_arch.keys_hiprec):
                # tensors that require max precision
                data_qtype = gguf.GGMLQuantizationType.F32

        if (model_arch.shape_fix # NEVER reshape for models such as flux
            and n_dims > 1 # Skip one-dimensional tensors
            and n_params >= REARRANGE_THRESHOLD # Only rearrange tensors meeting the size requirement
            and (n_params / 256).is_integer() # Rearranging only makes sense if total elements is divisible by 256
            and not (data.shape[-1] / 256).is_integer() # Only need to rearrange if the last dimension is not divisible by 256
        ):
            orig_shape = data.shape
            data = data.reshape(n_params // 256, 256)
            writer.add_array(f"comfy.gguf.orig_shape.{key}", tuple(int(dim) for dim in orig_shape))

        try:
            data = gguf.quants.quantize(data, data_qtype)
            quantized_tensors[key] = data_qtype
        except (AttributeError, gguf.QuantError) as e:
            tqdm.write(f"falling back to F16: {e}")
            data_qtype = gguf.GGMLQuantizationType.F16
            data = gguf.quants.quantize(data, data_qtype)
            quantized_tensors[key] = data_qtype

        shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}"
        tqdm.write(f"{f'%-{max_name_len + 4}s' % f'{key}'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")

        writer.add_tensor(key, data, raw_dtype=data_qtype)

    return quantized_tensors, invalid_tensors

def convert_file(path, dst_path=None, interact=True, overwrite=False, allow_fp32=False):
    # load & run model detection logic
    state_dict = load_state_dict(path)
    model_arch = detect_arch(state_dict)
    logging.info(f"* Architecture detected from input: {model_arch.arch}")

    ftype_name, ftype_gguf = find_main_dtype(state_dict, allow_fp32=allow_fp32)

    if dst_path is None:
        dst_path = f"{os.path.splitext(path)[0]}-{ftype_name}.gguf"
    elif "{ftype}" in dst_path: # lcpp logic
        dst_path = dst_path.replace("{ftype}", ftype_name)

    if os.path.isfile(dst_path) and not overwrite:
        if interact:
            input("Output exists enter to continue or ctrl+c to abort!")
        else:
            raise OSError("Output exists and overwriting is disabled!")

    # handle actual file
    writer = gguf.GGUFWriter(path=None, arch=model_arch.arch)
    writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
    if ftype_gguf is not None:
        writer.add_file_type(ftype_gguf)

    quantized_tensors, invalid_tensors = handle_tensors(writer, state_dict, model_arch, allow_fp32=allow_fp32)
    if len(invalid_tensors) > 0:
        if not model_arch.ndims_fix: # only applies to 5D fix for now, possibly expand to cover more cases?
            raise ValueError(f"Tensor(s) detected that exceeds dims supported by C++ code! ({invalid_tensors.keys()})")

        fix_path = os.path.join(
            os.path.dirname(dst_path),
            f"fix_5d_tensors_{model_arch.arch}.safetensors"
        )
        if os.path.isfile(fix_path):
            raise RuntimeError(f"Tensor fix file already exists! {path}")

        invalid_tensors = {k:torch.from_numpy(v.copy()) for k,v in invalid_tensors.items()}
        save_file(invalid_tensors, fix_path)
        logging.warning(f"\n### Warning! Fix file found at '{fix_path}'")
        logging.warning(" you most likely need to run 'fix_5d_tensors.py' after quantization.")
    else:
        fix_path = None

    writer.write_header_to_file(path=dst_path)
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file(progress=True)
    writer.close()

    return dst_path, model_arch, fix_path

if __name__ == "__main__":
    args = parse_args()
    convert_file(args.src, args.dst)
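
For reference, the deleted converter produced an unquantized F16/BF16 GGUF from a single UNET checkpoint and reported whether a separate 5D-tensor fix file was written. A minimal usage sketch, assuming hypothetical file names:

# CLI:  python tools/convert.py --src wan2.1-t2v.safetensors --dst wan2.1-t2v-{ftype}.gguf
# The same entry point was importable; "{ftype}" is filled in with F16/BF16/F32 by convert_file().
from convert import convert_file

dst_path, model_arch, fix_path = convert_file(
    "wan2.1-t2v.safetensors",    # hypothetical source checkpoint
    "wan2.1-t2v-{ftype}.gguf",   # hypothetical output pattern
    interact=False,
)
print(dst_path, model_arch.arch, fix_path)  # fix_path is None unless >4D tensors were split out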
tools/fix_5d_tensors.py DELETED
@@ -1,85 +0,0 @@
# (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0)
import os
import gguf
import torch
import argparse
from tqdm import tqdm
from safetensors.torch import load_file

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--src", required=True)
    parser.add_argument("--dst", required=True)
    parser.add_argument("--fix", required=False, help="Defaults to ./fix_5d_tensors_[arch].pt")
    parser.add_argument("--overwrite", action="store_true")
    args = parser.parse_args()

    if not os.path.isfile(args.src):
        parser.error(f"Invalid source file '{args.src}'")
    if not args.overwrite and os.path.exists(args.dst):
        parser.error(f"Output exists, use '--overwrite' ({args.dst})")

    return args

def get_arch_str(reader):
    field = reader.get_field("general.architecture")
    return str(field.parts[field.data[-1]], encoding="utf-8")

def get_file_type(reader):
    field = reader.get_field("general.file_type")
    ft = int(field.parts[field.data[-1]])
    return gguf.LlamaFileType(ft)

def apply_5d_fix(src, dst, fix=None, overwrite=False):
    # read existing
    reader = gguf.GGUFReader(src)
    arch = get_arch_str(reader)
    file_type = get_file_type(reader)
    print(f"Detected arch: '{arch}' (ftype: {str(file_type)})")

    # prep fix
    if fix is None:
        fix = f"./fix_5d_tensors_{arch}.safetensors"

    if not os.path.isfile(fix):
        raise OSError(f"No 5D tensor fix file: {fix}")

    sd5d = load_file(fix)
    sd5d = {k:v.numpy() for k,v in sd5d.items()}
    print("5D tensors:", sd5d.keys())

    # prep output
    writer = gguf.GGUFWriter(path=None, arch=arch)
    writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
    writer.add_file_type(file_type)

    global added
    added = []
    def add_extra_key(writer, key, data):
        global added
        data_qtype = gguf.GGMLQuantizationType.F32
        data = gguf.quants.quantize(data, data_qtype)
        tqdm.write(f"Adding key {key} ({data.shape})")
        writer.add_tensor(key, data, raw_dtype=data_qtype)
        added.append(key)

    # main loop to add missing 5D tensor(s)
    for tensor in tqdm(reader.tensors):
        writer.add_tensor(tensor.name, tensor.data, raw_dtype=tensor.tensor_type)
        key5d = tensor.name.replace(".bias", ".weight")
        if key5d in sd5d.keys():
            add_extra_key(writer, key5d, sd5d[key5d])

    # brute force for any missed
    for key, data in sd5d.items():
        if key not in added:
            add_extra_key(writer, key, data)

    writer.write_header_to_file(path=dst)
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file(progress=True)
    writer.close()

if __name__ == "__main__":
    args = get_args()
    apply_5d_fix(args.src, args.dst, fix=args.fix, overwrite=args.overwrite)
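
For reference, this helper merged the tensors with more than four dimensions (saved separately by convert.py) back into a quantized GGUF. A minimal sketch, assuming hypothetical file names:

# CLI:  python tools/fix_5d_tensors.py --src model-Q8_0.gguf --dst model-Q8_0-fixed.gguf
from fix_5d_tensors import apply_5d_fix

apply_5d_fix(
    "model-Q8_0.gguf",                      # quantized input (hypothetical)
    "model-Q8_0-fixed.gguf",                # output with the 5D tensors restored (hypothetical)
    fix="fix_5d_tensors_wan.safetensors",   # written by convert.py; defaults to ./fix_5d_tensors_[arch].safetensors
)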
tools/fix_lines_ending.py DELETED
@@ -1,31 +0,0 @@
import os

files = ["lcpp.patch", "lcpp_sd3.patch"]

def has_unix_line_endings(file_path):
    try:
        with open(file_path, 'rb') as file:
            content = file.read()
        return b'\r\n' not in content
    except Exception as e:
        print(f"Error checking '{file_path}': {e}")
        return False

def convert_to_linux_format(file_path):
    try:
        with open(file_path, 'rb') as file:
            content = file.read().replace(b'\r\n', b'\n')
        with open(file_path, 'wb') as file:
            file.write(content)
        print(f"'{file_path}' converted to Linux line endings (LF).")
    except Exception as e:
        print(f"Error processing '{file_path}': {e}")

for file in files:
    if os.path.exists(file):
        if has_unix_line_endings(file):
            print(f"'{file}' already has Unix line endings (LF). No conversion needed.")
        else:
            convert_to_linux_format(file)
    else:
        print(f"File '{file}' does not exist.")
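
This helper only normalized line endings on the bundled patch files so that git apply would not reject them; the equivalent one-off, as a sketch:

with open("lcpp.patch", "rb") as f:
    data = f.read().replace(b"\r\n", b"\n")  # CRLF -> LF
with open("lcpp.patch", "wb") as f:
    f.write(data)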
tools/lcpp.patch DELETED
@@ -1,499 +0,0 @@
1
- diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
2
- index de3c706f..0267c1fa 100644
3
- --- a/ggml/include/ggml.h
4
- +++ b/ggml/include/ggml.h
5
- @@ -223,7 +223,7 @@
6
- #define GGML_MAX_OP_PARAMS 64
7
-
8
- #ifndef GGML_MAX_NAME
9
- -# define GGML_MAX_NAME 64
10
- +# define GGML_MAX_NAME 128
11
- #endif
12
-
13
- #define GGML_DEFAULT_N_THREADS 4
14
- @@ -2449,6 +2449,7 @@ extern "C" {
15
-
16
- // manage tensor info
17
- GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
18
- + GGML_API void gguf_set_tensor_ndim(struct gguf_context * ctx, const char * name, int n_dim);
19
- GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
20
- GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
21
-
22
- diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
23
- index b16c462f..6d1568f1 100644
24
- --- a/ggml/src/ggml.c
25
- +++ b/ggml/src/ggml.c
26
- @@ -22960,6 +22960,14 @@ void gguf_add_tensor(
27
- ctx->header.n_tensors++;
28
- }
29
-
30
- +void gguf_set_tensor_ndim(struct gguf_context * ctx, const char * name, const int n_dim) {
31
- + const int idx = gguf_find_tensor(ctx, name);
32
- + if (idx < 0) {
33
- + GGML_ABORT("tensor not found");
34
- + }
35
- + ctx->infos[idx].n_dims = n_dim;
36
- +}
37
- +
38
- void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
39
- const int idx = gguf_find_tensor(ctx, name);
40
- if (idx < 0) {
41
- diff --git a/src/llama.cpp b/src/llama.cpp
42
- index 24e1f1f0..8a1e9ef8 100644
43
- --- a/src/llama.cpp
44
- +++ b/src/llama.cpp
45
- @@ -205,6 +205,18 @@ enum llm_arch {
46
- LLM_ARCH_GRANITE,
47
- LLM_ARCH_GRANITE_MOE,
48
- LLM_ARCH_CHAMELEON,
49
- + LLM_ARCH_FLUX,
50
- + LLM_ARCH_SD1,
51
- + LLM_ARCH_SDXL,
52
- + LLM_ARCH_SD3,
53
- + LLM_ARCH_AURA,
54
- + LLM_ARCH_LTXV,
55
- + LLM_ARCH_HYVID,
56
- + LLM_ARCH_WAN,
57
- + LLM_ARCH_HIDREAM,
58
- + LLM_ARCH_COSMOS,
59
- + LLM_ARCH_LUMINA2,
60
- + LLM_ARCH_QWEN_IMAGE,
61
- LLM_ARCH_UNKNOWN,
62
- };
63
-
64
- @@ -258,6 +270,18 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
65
- { LLM_ARCH_GRANITE, "granite" },
66
- { LLM_ARCH_GRANITE_MOE, "granitemoe" },
67
- { LLM_ARCH_CHAMELEON, "chameleon" },
68
- + { LLM_ARCH_FLUX, "flux" },
69
- + { LLM_ARCH_SD1, "sd1" },
70
- + { LLM_ARCH_SDXL, "sdxl" },
71
- + { LLM_ARCH_SD3, "sd3" },
72
- + { LLM_ARCH_AURA, "aura" },
73
- + { LLM_ARCH_LTXV, "ltxv" },
74
- + { LLM_ARCH_HYVID, "hyvid" },
75
- + { LLM_ARCH_WAN, "wan" },
76
- + { LLM_ARCH_HIDREAM, "hidream" },
77
- + { LLM_ARCH_COSMOS, "cosmos" },
78
- + { LLM_ARCH_LUMINA2, "lumina2" },
79
- + { LLM_ARCH_QWEN_IMAGE, "qwen_image" },
80
- { LLM_ARCH_UNKNOWN, "(unknown)" },
81
- };
82
-
83
- @@ -1531,6 +1555,18 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
84
- { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
85
- },
86
- },
87
- + { LLM_ARCH_FLUX, {}},
88
- + { LLM_ARCH_SD1, {}},
89
- + { LLM_ARCH_SDXL, {}},
90
- + { LLM_ARCH_SD3, {}},
91
- + { LLM_ARCH_AURA, {}},
92
- + { LLM_ARCH_LTXV, {}},
93
- + { LLM_ARCH_HYVID, {}},
94
- + { LLM_ARCH_WAN, {}},
95
- + { LLM_ARCH_HIDREAM, {}},
96
- + { LLM_ARCH_COSMOS, {}},
97
- + { LLM_ARCH_LUMINA2, {}},
98
- + { LLM_ARCH_QWEN_IMAGE, {}},
99
- {
100
- LLM_ARCH_UNKNOWN,
101
- {
102
- @@ -5403,6 +5439,26 @@ static void llm_load_hparams(
103
- // get general kv
104
- ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
105
-
106
- + // Disable LLM metadata for image models
107
- + switch (model.arch) {
108
- + case LLM_ARCH_FLUX:
109
- + case LLM_ARCH_SD1:
110
- + case LLM_ARCH_SDXL:
111
- + case LLM_ARCH_SD3:
112
- + case LLM_ARCH_AURA:
113
- + case LLM_ARCH_LTXV:
114
- + case LLM_ARCH_HYVID:
115
- + case LLM_ARCH_WAN:
116
- + case LLM_ARCH_HIDREAM:
117
- + case LLM_ARCH_COSMOS:
118
- + case LLM_ARCH_LUMINA2:
119
- + case LLM_ARCH_QWEN_IMAGE:
120
- + model.ftype = ml.ftype;
121
- + return;
122
- + default:
123
- + break;
124
- + }
125
- +
126
- // get hparams kv
127
- ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
128
-
129
- @@ -18016,6 +18072,158 @@ static void llama_tensor_dequantize_internal(
130
- workers.clear();
131
- }
132
-
133
- +static ggml_type img_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
134
- + // Special function for quantizing image model tensors
135
- + const std::string name = ggml_get_name(tensor);
136
- + const llm_arch arch = qs.model.arch;
137
- +
138
- + // Sanity check
139
- + if (
140
- + (name.find("model.diffusion_model.") != std::string::npos) ||
141
- + (name.find("first_stage_model.") != std::string::npos) ||
142
- + (name.find("single_transformer_blocks.") != std::string::npos) ||
143
- + (name.find("joint_transformer_blocks.") != std::string::npos)
144
- + ) {
145
- + throw std::runtime_error("Invalid input GGUF file. This is not a supported UNET model");
146
- + }
147
- +
148
- + // Unsupported quant types - exclude all IQ quants for now
149
- + if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
150
- + ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
151
- + ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
152
- + ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
153
- + ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
154
- + ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_Q4_0_4_4 ||
155
- + ftype == LLAMA_FTYPE_MOSTLY_Q4_0_4_8 || ftype == LLAMA_FTYPE_MOSTLY_Q4_0_8_8) {
156
- + throw std::runtime_error("Invalid quantization type for image model (Not supported)");
157
- + }
158
- +
159
- + if ( // Rules for to_v attention
160
- + (name.find("attn_v.weight") != std::string::npos) ||
161
- + (name.find(".to_v.weight") != std::string::npos) ||
162
- + (name.find(".v.weight") != std::string::npos) ||
163
- + (name.find(".attn.w1v.weight") != std::string::npos) ||
164
- + (name.find(".attn.w2v.weight") != std::string::npos) ||
165
- + (name.find(".add_v_proj.weight") != std::string::npos) ||
166
- + (name.find("_attn.v_proj.weight") != std::string::npos)
167
- + ){
168
- + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
169
- + new_type = GGML_TYPE_Q3_K;
170
- + }
171
- + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
172
- + new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
173
- + }
174
- + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
175
- + new_type = GGML_TYPE_Q5_K;
176
- + }
177
- + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
178
- + new_type = GGML_TYPE_Q6_K;
179
- + }
180
- + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) {
181
- + new_type = GGML_TYPE_Q5_K;
182
- + }
183
- + ++qs.i_attention_wv;
184
- + } else if ( // Rules for fused qkv attention
185
- + (name.find("attn_qkv.weight") != std::string::npos) ||
186
- + (name.find("attn.qkv.weight") != std::string::npos) ||
187
- + (name.find("attention.qkv.weight") != std::string::npos)
188
- + ) {
189
- + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
190
- + new_type = GGML_TYPE_Q4_K;
191
- + }
192
- + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
193
- + new_type = GGML_TYPE_Q5_K;
194
- + }
195
- + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
196
- + new_type = GGML_TYPE_Q6_K;
197
- + }
198
- + } else if ( // Rules for ffn
199
- + (name.find("ffn_down") != std::string::npos) ||
200
- + ((name.find("experts.") != std::string::npos) && (name.find(".w2.weight") != std::string::npos)) ||
201
- + (name.find(".ffn.2.weight") != std::string::npos) || // is this even the right way around?
202
- + (name.find(".ff.net.2.weight") != std::string::npos) ||
203
- + (name.find(".mlp.layer2.weight") != std::string::npos) ||
204
- + (name.find(".adaln_modulation_mlp.2.weight") != std::string::npos) ||
205
- + (name.find(".feed_forward.w2.weight") != std::string::npos) ||
206
- + (name.find(".img_mlp.net.2.weight") != std::string::npos) ||
207
- + (name.find(".txt_mlp.net.2.weight") != std::string::npos)
208
- + ) {
209
- + // TODO: add back `layer_info` with some model specific logic + logic further down
210
- + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
211
- + new_type = GGML_TYPE_Q4_K;
212
- + }
213
- + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
214
- + new_type = GGML_TYPE_Q5_K;
215
- + }
216
- + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) {
217
- + new_type = GGML_TYPE_Q5_K;
218
- + }
219
- + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
220
- + new_type = GGML_TYPE_Q6_K;
221
- + }
222
- + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
223
- + new_type = GGML_TYPE_Q6_K;
224
- + }
225
- + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_0) {
226
- + new_type = GGML_TYPE_Q4_1;
227
- + }
228
- + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_0) {
229
- + new_type = GGML_TYPE_Q5_1;
230
- + }
231
- + ++qs.i_ffn_down;
232
- + }
233
- +
234
- + // first/last block high precision test
235
- + if (arch == LLM_ARCH_QWEN_IMAGE){
236
- + if (
237
- + (name.find("transformer_blocks.0.") != std::string::npos) ||
238
- + (name.find("transformer_blocks.59.") != std::string::npos) // this should be dynamic
239
- + ) {
240
- + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
241
- + new_type = GGML_TYPE_Q4_K;
242
- + }
243
- + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
244
- + new_type = GGML_TYPE_Q4_K;
245
- + }
246
- + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
247
- + new_type = GGML_TYPE_Q5_K;
248
- + }
249
- + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
250
- + new_type = GGML_TYPE_Q6_K;
251
- + }
252
- + }
253
- + }
254
- +
255
- + // Sanity check for row shape
256
- + bool convert_incompatible_tensor = false;
257
- + if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
258
- + new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
259
- + int nx = tensor->ne[0];
260
- + int ny = tensor->ne[1];
261
- + if (nx % QK_K != 0) {
262
- + LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
263
- + convert_incompatible_tensor = true;
264
- + } else {
265
- + ++qs.n_k_quantized;
266
- + }
267
- + }
268
- + if (convert_incompatible_tensor) {
269
- + // TODO: Possibly reenable this in the future
270
- + // switch (new_type) {
271
- + // case GGML_TYPE_Q2_K:
272
- + // case GGML_TYPE_Q3_K:
273
- + // case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
274
- + // case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
275
- + // case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
276
- + // default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
277
- + // }
278
- + new_type = GGML_TYPE_F16;
279
- + LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
280
- + ++qs.n_fallback;
281
- + }
282
- + return new_type;
283
- +}
284
- +
285
- static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
286
- const std::string name = ggml_get_name(tensor);
287
-
288
- @@ -18513,7 +18721,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
289
- if (llama_model_has_encoder(&model)) {
290
- n_attn_layer *= 3;
291
- }
292
- - GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
293
- + if (model.arch != LLM_ARCH_HYVID) { // TODO: Check why this fails
294
- + GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
295
- + }
296
- }
297
-
298
- size_t total_size_org = 0;
299
- @@ -18547,6 +18757,57 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
300
- ctx_outs[i_split] = gguf_init_empty();
301
- }
302
- gguf_add_tensor(ctx_outs[i_split], tensor);
303
- + // SD3 pos_embed needs special fix as first dim is 1, which gets truncated here
304
- + if (model.arch == LLM_ARCH_SD3) {
305
- + const std::string name = ggml_get_name(tensor);
306
- + if (name == "pos_embed" && tensor->ne[2] == 1) {
307
- + const int n_dim = 3;
308
- + gguf_set_tensor_ndim(ctx_outs[i_split], "pos_embed", n_dim);
309
- + LLAMA_LOG_INFO("\n%s: Correcting pos_embed shape for SD3: [key:%s]\n", __func__, tensor->name);
310
- + }
311
- + }
312
- + // same goes for auraflow
313
- + if (model.arch == LLM_ARCH_AURA) {
314
- + const std::string name = ggml_get_name(tensor);
315
- + if (name == "positional_encoding" && tensor->ne[2] == 1) {
316
- + const int n_dim = 3;
317
- + gguf_set_tensor_ndim(ctx_outs[i_split], "positional_encoding", n_dim);
318
- + LLAMA_LOG_INFO("\n%s: Correcting positional_encoding shape for AuraFlow: [key:%s]\n", __func__, tensor->name);
319
- + }
320
- + if (name == "register_tokens" && tensor->ne[2] == 1) {
321
- + const int n_dim = 3;
322
- + gguf_set_tensor_ndim(ctx_outs[i_split], "register_tokens", n_dim);
323
- + LLAMA_LOG_INFO("\n%s: Correcting register_tokens shape for AuraFlow: [key:%s]\n", __func__, tensor->name);
324
- + }
325
- + }
326
- + // conv3d fails due to max dims - unsure what to do here as we never even reach this check
327
- + if (model.arch == LLM_ARCH_HYVID) {
328
- + const std::string name = ggml_get_name(tensor);
329
- + if (name == "img_in.proj.weight" && tensor->ne[5] != 1 ) {
330
- + throw std::runtime_error("img_in.proj.weight size failed for HyVid");
331
- + }
332
- + }
333
- + // All the modulation layers also have dim1, and I think conv3d fails here too but we segfault way before that...
334
- + if (model.arch == LLM_ARCH_WAN) {
335
- + const std::string name = ggml_get_name(tensor);
336
- + if (name.find(".modulation") != std::string::npos && tensor->ne[2] == 1) {
337
- + const int n_dim = 3;
338
- + gguf_set_tensor_ndim(ctx_outs[i_split], tensor->name, n_dim);
339
- + LLAMA_LOG_INFO("\n%s: Correcting shape for Wan: [key:%s]\n", __func__, tensor->name);
340
- + }
341
- + // FLF2V model only
342
- + if (name == "img_emb.emb_pos") {
343
- + const int n_dim = 3;
344
- + gguf_set_tensor_ndim(ctx_outs[i_split], tensor->name, n_dim);
345
- + LLAMA_LOG_INFO("\n%s: Correcting shape for Wan FLF2V: [key:%s]\n", __func__, tensor->name);
346
- + }
347
- + // S2V model only
348
- + if (name == "casual_audio_encoder.weights" || name == "casual_audio_encoder.encoder.padding_tokens") {
349
- + const int n_dim = 4;
350
- + gguf_set_tensor_ndim(ctx_outs[i_split], tensor->name, n_dim);
351
- + LLAMA_LOG_INFO("\n%s: Correcting shape for Wan S2V: [key:%s]\n", __func__, tensor->name);
352
- + }
353
- + }
354
- }
355
-
356
- // Set split info if needed
357
- @@ -18647,6 +18908,124 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
358
- // do not quantize relative position bias (T5)
359
- quantize &= name.find("attn_rel_b.weight") == std::string::npos;
360
-
361
- + // rules for image models
362
- + bool image_model = false;
363
- + if (model.arch == LLM_ARCH_FLUX) {
364
- + image_model = true;
365
- + quantize &= name.find("txt_in.") == std::string::npos;
366
- + quantize &= name.find("img_in.") == std::string::npos;
367
- + quantize &= name.find("time_in.") == std::string::npos;
368
- + quantize &= name.find("vector_in.") == std::string::npos;
369
- + quantize &= name.find("guidance_in.") == std::string::npos;
370
- + quantize &= name.find("final_layer.") == std::string::npos;
371
- + }
372
- + if (model.arch == LLM_ARCH_SD1 || model.arch == LLM_ARCH_SDXL) {
373
- + image_model = true;
374
- + quantize &= name.find("class_embedding.") == std::string::npos;
375
- + quantize &= name.find("time_embedding.") == std::string::npos;
376
- + quantize &= name.find("add_embedding.") == std::string::npos;
377
- + quantize &= name.find("time_embed.") == std::string::npos;
378
- + quantize &= name.find("label_emb.") == std::string::npos;
379
- + quantize &= name.find("conv_in.") == std::string::npos;
380
- + quantize &= name.find("conv_out.") == std::string::npos;
381
- + quantize &= name != "input_blocks.0.0.weight";
382
- + quantize &= name != "out.2.weight";
383
- + }
384
- + if (model.arch == LLM_ARCH_SD3) {
385
- + image_model = true;
386
- + quantize &= name.find("final_layer.") == std::string::npos;
387
- + quantize &= name.find("time_text_embed.") == std::string::npos;
388
- + quantize &= name.find("context_embedder.") == std::string::npos;
389
- + quantize &= name.find("t_embedder.") == std::string::npos;
390
- + quantize &= name.find("y_embedder.") == std::string::npos;
391
- + quantize &= name.find("x_embedder.") == std::string::npos;
392
- + quantize &= name != "proj_out.weight";
393
- + quantize &= name != "pos_embed";
394
- + }
395
- + if (model.arch == LLM_ARCH_AURA) {
396
- + image_model = true;
397
- + quantize &= name.find("t_embedder.") == std::string::npos;
398
- + quantize &= name.find("init_x_linear.") == std::string::npos;
399
- + quantize &= name != "modF.1.weight";
400
- + quantize &= name != "cond_seq_linear.weight";
401
- + quantize &= name != "final_linear.weight";
402
- + quantize &= name != "final_linear.weight";
403
- + quantize &= name != "positional_encoding";
404
- + quantize &= name != "register_tokens";
405
- + }
406
- + if (model.arch == LLM_ARCH_LTXV) {
407
- + image_model = true;
408
- + quantize &= name.find("adaln_single.") == std::string::npos;
409
- + quantize &= name.find("caption_projection.") == std::string::npos;
410
- + quantize &= name.find("patchify_proj.") == std::string::npos;
411
- + quantize &= name.find("proj_out.") == std::string::npos;
412
- + quantize &= name.find("scale_shift_table") == std::string::npos; // last block too
413
- + }
414
- + if (model.arch == LLM_ARCH_HYVID) {
415
- + image_model = true;
416
- + quantize &= name.find("txt_in.") == std::string::npos;
417
- + quantize &= name.find("img_in.") == std::string::npos;
418
- + quantize &= name.find("time_in.") == std::string::npos;
419
- + quantize &= name.find("vector_in.") == std::string::npos;
420
- + quantize &= name.find("guidance_in.") == std::string::npos;
421
- + quantize &= name.find("final_layer.") == std::string::npos;
422
- + }
423
- + if (model.arch == LLM_ARCH_WAN) {
424
- + image_model = true;
425
- + quantize &= name.find("modulation.") == std::string::npos;
426
- + quantize &= name.find("patch_embedding.") == std::string::npos;
427
- + quantize &= name.find("text_embedding.") == std::string::npos;
428
- + quantize &= name.find("time_projection.") == std::string::npos;
429
- + quantize &= name.find("time_embedding.") == std::string::npos;
430
- + quantize &= name.find("img_emb.") == std::string::npos;
431
- + quantize &= name.find("head.") == std::string::npos;
432
- + // S2V
433
- + quantize &= name.find("cond_encoder.") == std::string::npos;
434
- + quantize &= name.find("frame_packer.") == std::string::npos;
435
- + quantize &= name.find("audio_injector.") == std::string::npos;
436
- + quantize &= name.find("casual_audio_encoder.") == std::string::npos;
437
- + quantize &= name.find("trainable_cond_mask.") == std::string::npos;
438
- + }
439
- + if (model.arch == LLM_ARCH_HIDREAM) {
440
- + image_model = true;
441
- + quantize &= name.find("p_embedder.") == std::string::npos;
442
- + quantize &= name.find("t_embedder.") == std::string::npos;
443
- + quantize &= name.find("x_embedder.") == std::string::npos;
444
- + quantize &= name.find("final_layer.") == std::string::npos;
445
- + quantize &= name.find(".ff_i.gate.weight") == std::string::npos;
446
- + quantize &= name.find("caption_projection.") == std::string::npos;
447
- + }
448
- + if (model.arch == LLM_ARCH_COSMOS) {
449
- + image_model = true;
450
- + quantize &= name.find("p_embedder.") == std::string::npos;
451
- + quantize &= name.find("t_embedder.") == std::string::npos;
452
- + quantize &= name.find("t_embedding_norm.") == std::string::npos;
453
- + quantize &= name.find("x_embedder.") == std::string::npos;
454
- + quantize &= name.find("pos_embedder.") == std::string::npos;
455
- + quantize &= name.find("final_layer.") == std::string::npos;
456
- + }
457
- + if (model.arch == LLM_ARCH_LUMINA2) {
458
- + image_model = true;
459
- + quantize &= name.find("t_embedder.") == std::string::npos;
460
- + quantize &= name.find("x_embedder.") == std::string::npos;
461
- + quantize &= name.find("final_layer.") == std::string::npos;
462
- + quantize &= name.find("cap_embedder.") == std::string::npos;
463
- + quantize &= name.find("context_refiner.") == std::string::npos;
464
- + quantize &= name.find("noise_refiner.") == std::string::npos;
465
- + }
466
- + if (model.arch == LLM_ARCH_QWEN_IMAGE) {
467
- + image_model = true;
468
- + quantize &= name.find("img_in.") == std::string::npos;
469
- + quantize &= name.find("txt_in.") == std::string::npos;
470
- + quantize &= name.find("time_text_embed.") == std::string::npos;
471
- + quantize &= name.find("proj_out.") == std::string::npos;
472
- + quantize &= name.find("norm_out.") == std::string::npos;
473
- + }
474
- + // ignore 3D/4D tensors for image models as the code was never meant to handle these
475
- + if (image_model) {
476
- + quantize &= ggml_n_dims(tensor) == 2;
477
- + }
478
- +
479
- enum ggml_type new_type;
480
- void * new_data;
481
- size_t new_size;
482
- @@ -18655,6 +19034,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
483
- new_type = default_type;
484
-
485
- // get more optimal quantization type based on the tensor shape, layer, etc.
486
- + if (image_model) {
487
- + new_type = img_tensor_get_type(qs, new_type, tensor, ftype);
488
- + } else {
489
- if (!params->pure && ggml_is_quantized(default_type)) {
490
- new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
491
- }
492
- @@ -18664,6 +19046,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
493
- if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
494
- new_type = params->output_tensor_type;
495
- }
496
- + }
497
-
498
- // If we've decided to quantize to the same type the tensor is already
499
- // in then there's nothing to do.
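
The deleted patch added the image/video architectures above to llama.cpp's quantizer and raised GGML_MAX_NAME so longer tensor names fit. The manual workflow it supported (automated by tool_auto.py below) is sketched here with hypothetical paths and file names:

import subprocess

# Clone llama.cpp at the pinned tag, apply the patch, and build llama-quantize (paths are hypothetical).
subprocess.run(["git", "clone", "https://github.com/ggerganov/llama.cpp", "llama.cpp.auto"], check=True)
subprocess.run(["git", "-C", "llama.cpp.auto", "checkout", "tags/b3962"], check=True)
subprocess.run(["git", "-C", "llama.cpp.auto", "apply", "tools/lcpp.patch"], check=True)
subprocess.run(["cmake", "-B", "llama.cpp.auto/build", "llama.cpp.auto"], check=True)
subprocess.run(["cmake", "--build", "llama.cpp.auto/build", "--config", "Debug",
                "-j4", "--target", "llama-quantize"], check=True)

# Quantize a base F16/BF16 GGUF produced by convert.py (file names are hypothetical).
subprocess.run(["llama.cpp.auto/build/bin/llama-quantize",
                "model-F16.gguf", "model-Q4_K_M.gguf", "Q4_K_M"], check=True)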
tools/read_tensors.py DELETED
@@ -1,21 +0,0 @@
#!/usr/bin/python3
import os
import sys
import gguf

def read_tensors(path):
    reader = gguf.GGUFReader(path)
    for tensor in reader.tensors:
        if tensor.tensor_type == gguf.GGMLQuantizationType.F32:
            continue
        print(f"{str(tensor.tensor_type):32}: {tensor.name}")

try:
    path = sys.argv[1]
    assert os.path.isfile(path), "Invalid path"
    print(f"input: {path}")
except Exception as e:
    input(f"failed: {e}")
else:
    read_tensors(path)
    input()
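
For reference, the script took a single GGUF path and listed every tensor that is not stored as F32, which is a quick way to inspect a finished quant. A sketch, with a hypothetical file name:

# CLI:  python tools/read_tensors.py model-Q4_K_M.gguf
from read_tensors import read_tensors
read_tensors("model-Q4_K_M.gguf")  # prints "<quantization type>: <tensor name>" for each quantized tensor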
tools/tool_auto.py DELETED
@@ -1,374 +0,0 @@
1
- # (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0)
2
- import os
3
- import re
4
- import sys
5
- import time
6
- import torch
7
- import logging
8
- import argparse
9
- import subprocess
10
- import huggingface_hub as hf
11
-
12
- logging.getLogger().setLevel(logging.DEBUG)
13
-
14
- qtypes =[
15
- # "F16", "BF16",
16
- "Q8_0", "Q6_K",
17
- "Q5_K_M", "Q5_K_S", "Q5_1", "Q5_0",
18
- "Q4_K_M", "Q4_K_S", "Q4_1", "Q4_0",
19
- "Q3_K_M", "Q3_K_S", "Q2_K"
20
- ]
21
-
22
- dtype_dict = {
23
- "F32": torch.float32,
24
- "F16": torch.float16,
25
- "BF16": torch.bfloat16,
26
- "F8_E4M3": getattr(torch, "float8_e4m3fn", "_invalid"),
27
- "F8_E5M2": getattr(torch, "float8_e5m2", "_invalid"),
28
- }
29
-
30
- # this is pretty jank but I want to be able to run it on a blank instance w/o setup
31
- terraform_dict = {
32
- "repo": "city96/ComfyUI-GGUF",
33
- "target": "auto_convert",
34
- "lcpp_repo": "ggerganov/llama.cpp",
35
- "lcpp_target": "tags/b3962",
36
- }
37
-
38
- def get_args():
39
- parser = argparse.ArgumentParser()
40
- parser.add_argument("--src", required=True, help="Source model file or huggingface repo name")
41
- parser.add_argument("--quants", nargs="+", choices=["all", "base", *qtypes], default=["Q8_0"])
42
- parser.add_argument("--output-dir", default=None, help="Location for output files, defaults to current dir or ComfyUI model dir.")
43
- parser.add_argument("--temp-dir", default=None, help="Location for temp files, defaults to [output_dir]/tmp")
44
- parser.add_argument("--force-update", action="store_true", help="Force update & rebuild entire quantization stack.")
45
- parser.add_argument("--resume", action="store_true", help="Skip over existing files. Will NOT check for broken/interrupted files.")
46
-
47
- args = parser.parse_args()
48
- if args.output_dir is None:
49
- args.output_dir = get_output_dir()
50
- if args.temp_dir is None:
51
- args.temp_dir = os.path.join(args.output_dir, "tmp")
52
-
53
- if os.path.isdir(args.temp_dir) and len(os.listdir(args.temp_dir)) > 0:
54
- raise OSError("Output temp folder not empty!")
55
-
56
- if "all" in args.quants:
57
- args.quants = ["base", *qtypes]
58
-
59
- return args
60
-
61
- def run_cmd(*args, log_error=False):
62
- logging.debug(f"cmd: {args}")
63
- try:
64
- log = subprocess.run(args, capture_output=True, text=True)
65
- except Exception as e:
66
- logging.warning(f"{args[0]}, {e}")
67
- return -1
68
- if log.returncode != 0 and log_error:
69
- logging.warning(f"{args[0]}: {log.stdout} {log.stderr}")
70
- else:
71
- logging.debug(f"{args[0]}: {repr(log.stdout)} {repr(log.stderr.strip())} RET:{log.returncode}")
72
- return log.returncode
73
-
74
- def setup_utils(force_update=False):
75
- # get ComfyUI-GGUF if missing, then compile patched llama.cpp if required
76
- root = os.path.dirname(os.path.abspath(__file__))
77
- root = os.path.normpath(root)
78
-
79
- if os.path.split(root)[1] != "tools":
80
- cg_dir = os.path.join(root, "ComfyUI-GGUF")
81
- if not os.path.isdir(cg_dir):
82
- logging.warning(f"Running outside tools folder! Cloning to {cg_dir}")
83
- run_cmd("git", "clone", f"https://github.com/{terraform_dict['repo']}", cg_dir)
84
- need_update = True
85
- else:
86
- need_update = False
87
-
88
- if force_update or need_update:
89
- if terraform_dict['target']:
90
- logging.info(f"Attempting to check out ComfyUI-GGUF branch {terraform_dict['target']}")
91
- run_cmd("git", "-C", cg_dir, "checkout", terraform_dict['target'])
92
-
93
- logging.info("Attempting to git pull ComfyUI-GGUF to latest")
94
- run_cmd("git", "-C", cg_dir, "pull")
95
-
96
- tools_dir = os.path.join(root, "ComfyUI-GGUF", "tools")
97
- sys.path.append(tools_dir) # to make import(s) work
98
- else:
99
- # TODO: Git pull here too?
100
- logging.warning(f"Assuming latest ComfyUI-GGUF. Please git pull & check out branch {terraform_dict['target']} manually!")
101
- tools_dir = root
102
-
103
- if not os.path.isdir(tools_dir):
104
- raise OSError(f"Can't find tools subfolder in ComfyUI-GGUF at {tools_dir}")
105
-
106
- convert_path = os.path.join(tools_dir, "convert.py")
107
- if not os.path.isfile(convert_path):
108
- raise OSError(f"Cannot find convert.py at location: {convert_path}")
109
-
110
- lcpp_path = os.path.join(root, "llama.cpp.auto") # avoid messing with regular dir
111
- if not os.path.isdir(lcpp_path):
112
- logging.info(f"Attempting to clone llama.cpp repo to {lcpp_path}")
113
- run_cmd("git", "clone", f"https://github.com/{terraform_dict['lcpp_repo']}", lcpp_path)
114
- need_update = True
115
- else:
116
- need_update = False
117
-
118
- if force_update or need_update:
119
- # TODO: check reflog and/or git reset before checkout?
120
- logging.info(f"Attempting to check out llama.cpp target {terraform_dict['lcpp_target']}")
121
- run_cmd("git", "-C", lcpp_path, "checkout", terraform_dict['lcpp_target'])
122
-
123
- # TODO: git reset before patch?
124
- patch_path = os.path.join(tools_dir, "lcpp.patch")
125
- # patch (probably) has wrong file endings:
126
- logging.info("Converting patch file endings")
127
- with open(patch_path, "rb") as file:
128
- content = file.read().replace(b"\r\n", b"\n")
129
- with open(patch_path, "wb") as file:
130
- file.write(content)
131
-
132
- if run_cmd("git", "-C", lcpp_path, "apply", "--check", "-R", patch_path) != 0:
133
- logging.info("Attempting to apply patch to llama.cpp repo")
134
- run_cmd("git", "-C", lcpp_path, "apply", patch_path)
135
- else:
136
- logging.info("Patch already applied")
137
-
138
- # using cmake here as llama.cpp switched to it completely for new versions
139
- if os.name == "nt":
140
- bin_path = os.path.join(lcpp_path, "build", "bin", "debug", "llama-quantize.exe")
141
- else:
142
- bin_path = os.path.join(lcpp_path, "build", "bin", "llama-quantize")
143
-
144
- if not os.path.isfile(bin_path) or force_update or need_update:
145
- if run_cmd("cmake", "--version") != 0:
146
- raise RuntimeError("Can't find cmake! Make sure you have a working build environment set up")
147
-
148
- build_path = os.path.join(lcpp_path, "build")
149
- os.makedirs(build_path, exist_ok=True)
150
- logging.info("Attempting to build llama.cpp binary from source")
151
- run_cmd("cmake", "-B", build_path, lcpp_path)
152
- run_cmd("cmake", "--build", build_path, "--config", "Debug", "-j4", "--target", "llama-quantize")
153
- if not os.path.isfile(bin_path):
154
- raise RuntimeError("Build failed! Rerun with --debug to see error log.")
155
- else:
156
- logging.info("Binary already present")
157
-
158
- return bin_path
159
-
160
- def get_output_dir():
161
- root = os.path.dirname(os.path.abspath(__file__))
162
- root = os.path.normpath(root)
163
- split = os.path.split(root)
164
- while split[1]:
165
- if split[1] == "ComfyUI":
166
- if os.path.isdir(os.path.join(*split, "models", "unet")): # new
167
- root = os.path.join(*split, "models", "unet", "gguf")
168
- logging.info(f"Found ComfyUI, using model folder: {root}")
169
- return root
170
-
171
- if os.path.isdir(os.path.join(*split, "models", "diffusion_models")): # old
172
- root = os.path.join(*split, "models", "diffusion_models", "gguf")
173
- logging.info(f"Found ComfyUI, using model folder: {root}")
174
- return root
175
-
176
- logging.info("Found ComfyUI, but can't find model folder")
177
- break
178
-
179
- split = os.path.split(split[0])
180
-
181
- root = os.path.join(root, "models")
182
- logging.info(f"Defaulting to [script dir]/models: {root}")
183
- return root
184
-
185
- def get_hf_fake_sd(repo, path, device=torch.device("meta")):
186
- sd = {}
187
- meta = hf.parse_safetensors_file_metadata(repo, path)
188
- for key, raw in meta.tensors.items():
189
- shape = tuple(raw.shape)
190
- dtype = dtype_dict.get(raw.dtype, torch.float32)
191
- sd[key] = torch.zeros(shape, dtype=dtype, device=device)
192
- return sd
193
-
194
- def get_hf_file_arch(repo, path):
195
- pattern = r'(\d+)-of-(\d+)'
196
- match = re.search(pattern, path)
197
-
198
- if match:
199
- # we need to load it as multipart
200
- if int(match.group(1)) != 1:
201
- return None
202
- sd = {}
203
- for k in range(int(match.group(2))):
204
- shard_path = path.replace(match.group(1), f"{k+1:0{len(match.group(1))}}")
205
- sd.update(get_hf_fake_sd(repo, shard_path))
206
- else:
207
- sd = get_hf_fake_sd(repo, path)
208
-
209
- # this should raise an error on failure
210
- sd = strip_prefix(sd)
211
- model_arch = detect_arch(sd)
212
-
213
- # this is for SDXL and SD1.5, I want to overhaul this logic to match sd.cpp eventually
214
- assert not model_arch.shape_fix, "Model uses shape fix (SDXL/SD1) - unsupported for now."
215
- return model_arch.arch
216
-
217
- def get_hf_valid_files(repo):
218
- # TODO: probably tweak this?
219
- MIN_SIZE_GB = 1
220
- VALID_SRC_EXTS = [".safetensors", ] # ".pt", ".ckpt", ]
221
- meta = hf.model_info(repo, files_metadata=True)
222
-
223
- valid = {}
224
- for file in meta.siblings:
225
- path = file.rfilename
226
- fname = os.path.basename(path)
227
- name, ext = os.path.splitext(fname)
228
-
229
- if ext.lower() not in VALID_SRC_EXTS:
230
- logging.debug(f"Invalid ext: {path} {ext}")
231
- continue
232
-
233
- if file.size / (1024 ** 3) < MIN_SIZE_GB:
234
- logging.debug(f"File too small: {path} {file.size}")
235
- continue
236
-
237
- try:
238
- arch = get_hf_file_arch(repo, path)
239
- except Exception as e:
240
- logging.warning(f"Arch detect fail: {e} ({path})")
241
- else:
242
- if arch is not None:
243
- valid[path] = arch
244
- logging.info(f"Found '{arch}' model at path {path}")
245
- return valid
246
-
247
- def make_base_quant(src, output_dir, temp_dir, final=True, resume=True):
248
- name, ext = os.path.splitext(os.path.basename(src))
249
- if ext == ".gguf":
250
- logging.info("Input file already in gguf, assuming base quant")
251
- return None, src, None
252
-
253
- name = name.lower() # uncomment to preserve case in all quants
254
- dst_tmp = os.path.join(temp_dir, f"{name}-{{ftype}}.gguf") # ftype is filled in by convert.py
255
-
256
- tmp_path, model_arch, fix_path = convert_file(src, dst_tmp, interact=False, overwrite=False)
257
- dst_path = os.path.join(output_dir, os.path.basename(tmp_path))
258
- if os.path.isfile(dst_path):
259
- if resume:
260
- logging.warning("Resuming with interrupted base quant, may be incorrect!")
261
- return dst_path, tmp_path, fix_path
262
- raise OSError(f"Output already exists! Clear folder? {dst_path}")
263
-
264
- if fix_path is not None and os.path.isfile(fix_path):
265
- quant_source = tmp_path
266
- if final:
267
- apply_5d_fix(tmp_path, dst_path, fix=fix_path, overwrite=False)
268
- else:
269
- dst_path = None
270
- else:
271
- fix_path = None
272
- if final:
273
- os.rename(tmp_path, dst_path)
274
- quant_source = dst_path
275
- else:
276
- dst_path = None
277
- quant_source = tmp_path
278
-
279
- return dst_path, quant_source, fix_path
280
-
281
- def make_quant(src, output_dir, temp_dir, qtype, quantize_binary, fix_path=None, resume=True):
282
- name, ext = os.path.splitext(os.path.basename(src))
283
- assert ext.lower() == ".gguf", "Invalid input file"
284
-
285
- src_qtext = [x for x in ["-F32.gguf", "-F16.gguf", "-BF16.gguf"] if x in src]
286
- if len(src_qtext) == 1:
287
- tmp_path = os.path.join(
288
- temp_dir,
289
- os.path.basename(src).replace(src_qtext[0], f"-{qtype.upper()}.gguf")
290
- )
291
- else:
292
- tmp_path = os.path.join(
293
- temp_dir,
294
- f"{name}-{qtype.upper()}.gguf"
295
- )
296
- tmp_path = os.path.abspath(tmp_path)
297
- dst_path = os.path.join(output_dir, os.path.basename(tmp_path))
298
- if os.path.isfile(dst_path):
299
- if resume:
300
- return dst_path
301
- raise OSError("Output already exists! Clear folder?")
302
-
303
- r = run_cmd(quantize_binary, src, tmp_path, qtype, log_error=True)
304
- time.sleep(2) # leave time for file sync?
305
- if r != 0:
306
- raise RuntimeError(f"Quantization failed with error code {r}")
307
-
308
- if fix_path is not None:
309
- apply_5d_fix(tmp_path, dst_path, fix=fix_path, overwrite=False)
310
- if os.path.isfile(dst_path) and os.path.isfile(tmp_path):
311
- os.remove(tmp_path)
312
- else:
313
- os.rename(tmp_path, dst_path)
314
-
315
- return dst_path
316
-
317
- if __name__ == "__main__":
318
- args = get_args()
319
- os.makedirs(args.output_dir, exist_ok=True)
320
- os.makedirs(args.temp_dir, exist_ok=True)
321
- quantize_binary = setup_utils(args.force_update)
322
-
323
- try:
324
- from convert import detect_arch, strip_prefix, convert_file
325
- from fix_5d_tensors import apply_5d_fix
326
- except (ImportError, ModuleNotFoundError) as e:
327
- raise ImportError(f"Can't import required utils: {e}")
328
-
329
- if not os.path.isfile(args.src):
330
- # huggingface repo. TODO: file choice
331
- if len(args.src.split("/")) != 2:
332
- raise OSError(f"Invalid huggingface repo or model path {args.src}")
333
- raise NotImplementedError("HF not yet supported")
334
- # download then set to temp file
335
- # hf_repo = "Lightricks/LTX-Video" # "fal/AuraFlow-v0.3"
336
- # get_hf_valid_files(hf_repo)
337
- # args.src = ...
338
-
339
- out_files = []
340
-
341
- base_quant, quant_source, fix_path = make_base_quant(
342
- args.src,
343
- args.output_dir,
344
- args.temp_dir,
345
- final=("base" in args.quants),
346
- resume=args.resume,
347
- )
348
- if "base" in args.quants:
349
- args.quants = [x for x in args.quants if x not in ["base"]]
350
- if base_quant is not None:
351
- out_files.append(base_quant)
352
-
353
- for qtype in args.quants:
354
- out_files.append(make_quant(
355
- quant_source,
356
- args.output_dir,
357
- args.temp_dir,
358
- qtype,
359
- quantize_binary,
360
- fix_path,
361
- resume=args.resume,
362
- ))
363
-
364
- if fix_path is not None and os.path.isfile(fix_path):
365
- os.remove(fix_path)
366
-
367
- if base_quant != quant_source:
368
- # make sure our quant source is in the temp folder before removing it
369
- cc = os.path.commonpath([os.path.normpath(quant_source), os.path.normpath(args.temp_dir)])
370
- if cc == os.path.normpath(args.temp_dir):
371
- os.remove(quant_source)
372
-
373
- out_file_str = '\n'.join(out_files)
374
- logging.info(f"Output file(s): {out_file_str}")
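
For reference, the automation script took a source checkpoint plus a list of quant types, cloned and built the patched llama-quantize binary on first run, and wrote the results to a ComfyUI model folder when it could find one. A hedged usage sketch with hypothetical paths:

# python tools/tool_auto.py --src wan2.1-t2v.safetensors --quants base Q8_0 Q5_K_M Q4_K_S --output-dir ./models
# "--quants all" expands to the base F16/BF16 GGUF plus every entry in the qtypes list above;
# "--resume" skips output files that already exist instead of raising an error.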