Alissonerdx committed
Commit 8943c32 · verified · 1 parent: df39cb8

Delete tools

tools/convert.py DELETED
@@ -1,412 +0,0 @@
# (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0)
import os
import gguf
import torch
import logging
import argparse
from tqdm import tqdm
from safetensors.torch import load_file, save_file

QUANTIZATION_THRESHOLD = 1024
REARRANGE_THRESHOLD = 512
MAX_TENSOR_NAME_LENGTH = 127
MAX_TENSOR_DIMS = 4

class ModelTemplate:
    arch = "invalid"  # string describing architecture
    shape_fix = False # whether to reshape tensors
    ndims_fix = False # whether to save fix file for tensors exceeding max dims
    keys_detect = []  # list of lists to match in state dict
    keys_banned = []  # list of keys that should mark model as invalid for conversion
    keys_hiprec = []  # list of keys that need to be kept in fp32 for some reason
    keys_ignore = []  # list of strings to ignore keys by when found

class ModelFlux(ModelTemplate):
    arch = "flux"
    keys_detect = [
        ("single_transformer_blocks.0.attn.norm_k.weight",),
        ("double_blocks.0.img_attn.proj.weight",),
    ]
    keys_banned = ["single_transformer_blocks.0.attn.norm_k.weight",]

class ModelSD3(ModelTemplate):
    arch = "sd3"
    keys_detect = [
        ("transformer_blocks.0.ff_context.net.0.proj.weight",),
        ("joint_blocks.0.x_block.attn.qkv.weight",),
    ]
    keys_banned = ["transformer_blocks.0.ff_context.net.0.proj.weight",]

class ModelAura(ModelTemplate):
    arch = "aura"
    keys_detect = [
        ("double_layers.3.modX.1.weight",),
        ("joint_transformer_blocks.3.ff_context.out_projection.weight",),
    ]
    keys_banned = ["joint_transformer_blocks.3.ff_context.out_projection.weight",]

class ModelHiDream(ModelTemplate):
    arch = "hidream"
    keys_detect = [
        (
            "caption_projection.0.linear.weight",
            "double_stream_blocks.0.block.ff_i.shared_experts.w3.weight"
        )
    ]
    keys_hiprec = [
        # nn.parameter, can't load from BF16 ver
        ".ff_i.gate.weight",
        "img_emb.emb_pos"
    ]

class ModelCosmosPredict2(ModelTemplate):
    arch = "cosmos"
    keys_detect = [
        (
            "blocks.0.mlp.layer1.weight",
            "blocks.0.adaln_modulation_cross_attn.1.weight",
        )
    ]
    keys_hiprec = ["pos_embedder"]
    keys_ignore = ["_extra_state", "accum_"]

class ModelQwenImage(ModelTemplate):
    arch = "qwen_image"
    keys_detect = [
        (
            "time_text_embed.timestep_embedder.linear_2.weight",
            "transformer_blocks.0.attn.norm_added_q.weight",
            "transformer_blocks.0.img_mlp.net.0.proj.weight",
        )
    ]

class ModelHyVid(ModelTemplate):
    arch = "hyvid"
    ndims_fix = True
    keys_detect = [
        (
            "double_blocks.0.img_attn_proj.weight",
            "txt_in.individual_token_refiner.blocks.1.self_attn_qkv.weight",
        )
    ]

class ModelWan(ModelTemplate):
    arch = "wan"
    ndims_fix = True
    keys_detect = [
        (
            "blocks.0.self_attn.norm_q.weight",
            "text_embedding.2.weight",
            "head.modulation",
        )
    ]
    keys_hiprec = [
        ".modulation",                # nn.parameter, can't load from BF16 ver
        ".encoder.padding_tokens",    # nn.parameter, specific to S2V
        "trainable_cond_mask",        # used directly w/ .weight
        "casual_audio_encoder.weights",      # nn.parameter, specific to S2V
        "casual_audio_encoder.encoder.conv", # CausalConv1d doesn't use ops.py for now
    ]

class ModelLTXV(ModelTemplate):
    arch = "ltxv"
    keys_detect = [
        (
            "adaln_single.emb.timestep_embedder.linear_2.weight",
            "transformer_blocks.27.scale_shift_table",
            "caption_projection.linear_2.weight",
        )
    ]
    keys_hiprec = [
        "scale_shift_table" # nn.parameter, can't load from BF16 base quant
    ]

class ModelSDXL(ModelTemplate):
    arch = "sdxl"
    shape_fix = True
    keys_detect = [
        ("down_blocks.0.downsamplers.0.conv.weight", "add_embedding.linear_1.weight",),
        (
            "input_blocks.3.0.op.weight", "input_blocks.6.0.op.weight",
            "output_blocks.2.2.conv.weight", "output_blocks.5.2.conv.weight",
        ), # Non-diffusers
        ("label_emb.0.0.weight",),
    ]

class ModelSD1(ModelTemplate):
    arch = "sd1"
    shape_fix = True
    keys_detect = [
        ("down_blocks.0.downsamplers.0.conv.weight",),
        (
            "input_blocks.3.0.op.weight", "input_blocks.6.0.op.weight", "input_blocks.9.0.op.weight",
            "output_blocks.2.1.conv.weight", "output_blocks.5.2.conv.weight", "output_blocks.8.2.conv.weight"
        ), # Non-diffusers
    ]

class ModelLumina2(ModelTemplate):
    arch = "lumina2"
    keys_detect = [
        ("cap_embedder.1.weight", "context_refiner.0.attention.qkv.weight")
    ]

class ModelHuMo(ModelTemplate):
    arch = "humo"
    ndims_fix = True
    keys_detect = [
        ("blocks.39.audio_cross_attn_wrapper.norm1_audio.weight",),
        ("audio_proj.audio_proj_glob_1.layer.weight",),
        (
            "blocks.39.audio_cross_attn_wrapper.norm1_audio.weight",
            "blocks.0.self_attn.norm_q.weight",
            "text_embedding.2.weight",
            "head.modulation"
        ),
    ]
    keys_hiprec = ["patch_embedding", "text_embedding", "time_embedding", ".modulation"]

# The architectures are checked in order and the first successful match terminates the search.
arch_list = [
    ModelFlux, ModelSD3, ModelAura, ModelHiDream, ModelCosmosPredict2, ModelQwenImage,
    ModelLTXV, ModelHyVid, ModelHuMo, ModelWan, ModelSDXL, ModelSD1, ModelLumina2
]

def is_model_arch(model, state_dict):
    # check if model is correct
    matched = False
    invalid = False
    # print(state_dict)
    for match_list in model.keys_detect:
        if all(key in state_dict for key in match_list):
            matched = True
            invalid = any(key in state_dict for key in model.keys_banned)
            break
    assert not invalid, f"Model architecture not allowed for conversion! (i.e. reference VS diffusers format) [arch:{model.arch}]"
    return matched

def detect_arch(state_dict):
    model_arch = None
    for arch in arch_list:
        if is_model_arch(arch, state_dict):
            model_arch = arch()
            break
    assert model_arch is not None, "Unknown model architecture!"
    return model_arch

def parse_args():
    parser = argparse.ArgumentParser(description="Generate F16 GGUF files from single UNET")
    parser.add_argument("--src", required=True, help="Source model ckpt file.")
    parser.add_argument("--dst", help="Output unet gguf file.")
    args = parser.parse_args()

    if not os.path.isfile(args.src):
        parser.error("No input provided!")

    return args

def strip_prefix(state_dict):
    # prefix for mixed state dict
    prefix = None
    for pfx in ["model.diffusion_model.", "model."]:
        if any([x.startswith(pfx) for x in state_dict.keys()]):
            prefix = pfx
            break

    # prefix for uniform state dict
    if prefix is None:
        for pfx in ["net."]:
            if all([x.startswith(pfx) for x in state_dict.keys()]):
                prefix = pfx
                break

    # strip prefix if found
    if prefix is not None:
        logging.info(f"State dict prefix found: '{prefix}'")
        sd = {}
        for k, v in state_dict.items():
            if prefix not in k:
                continue
            k = k.replace(prefix, "")
            sd[k] = v
    else:
        logging.debug("State dict has no prefix")
        sd = state_dict

    return sd

def find_main_dtype(state_dict, allow_fp32=False):
    # detect most common dtype in input
    dtypes = [x.dtype for x in state_dict.values()]
    dtypes = {x:dtypes.count(x) for x in set(dtypes)}
    main_dtype = max(dtypes, key=dtypes.get)

    if main_dtype == torch.bfloat16:
        ftype_name = "BF16"
        ftype_gguf = gguf.LlamaFileType.MOSTLY_BF16
    elif main_dtype == torch.float32 and allow_fp32:
        ftype_name = "F32"
        ftype_gguf = gguf.LlamaFileType.ALL_F32
    else:
        ftype_name = "F16"
        ftype_gguf = gguf.LlamaFileType.MOSTLY_F16

    return ftype_name, ftype_gguf

def load_state_dict(path):
    if any(path.endswith(x) for x in [".ckpt", ".pt", ".bin", ".pth"]):
        state_dict = torch.load(path, map_location="cpu", weights_only=True)
        for subkey in ["model", "module"]:
            if subkey in state_dict:
                state_dict = state_dict[subkey]
                break
        if len(state_dict) < 20:
            raise RuntimeError(f"pt subkey load failed: {state_dict.keys()}")
    else:
        state_dict = load_file(path)

    return strip_prefix(state_dict)

def handle_tensors(writer, state_dict, model_arch, allow_fp32=False):
    name_lengths = tuple(sorted(
        ((key, len(key)) for key in state_dict.keys()),
        key=lambda item: item[1],
        reverse=True,
    ))
    if not name_lengths:
        return
    max_name_len = name_lengths[0][1]

    if max_name_len > MAX_TENSOR_NAME_LENGTH:
        bad_list = ", ".join(f"{key!r} ({namelen})" for key, namelen in name_lengths if namelen > MAX_TENSOR_NAME_LENGTH)
        raise ValueError(f"Can only handle tensor names up to {MAX_TENSOR_NAME_LENGTH} characters. Tensors exceeding the limit: {bad_list}")

    invalid_tensors = {}
    quantized_tensors = {}
    for key, data in tqdm(state_dict.items()):
        old_dtype = data.dtype

        if any(x in key for x in model_arch.keys_ignore):
            tqdm.write(f"Filtering ignored key: '{key}'")
            continue

        if data.dtype == torch.bfloat16:
            data = data.to(torch.float32).numpy()
        # this is so we don't break torch 2.0.X
        elif data.dtype in [getattr(torch, "float8_e4m3fn", "_invalid"), getattr(torch, "float8_e5m2", "_invalid")]:
            data = data.to(torch.float16).numpy()
        else:
            data = data.numpy()

        n_dims = len(data.shape)
        data_shape = data.shape
        if old_dtype == torch.bfloat16:
            data_qtype = gguf.GGMLQuantizationType.BF16
        elif old_dtype == torch.float32 and allow_fp32:
            data_qtype = gguf.GGMLQuantizationType.F32
        else:
            data_qtype = gguf.GGMLQuantizationType.F16

        # The max no. of dimensions that can be handled by the quantization code is 4
        if len(data.shape) > MAX_TENSOR_DIMS:
            invalid_tensors[key] = data
            continue # needs to be added back later

        # get number of parameters (AKA elements) in this tensor
        n_params = 1
        for dim_size in data_shape:
            n_params *= dim_size

        if old_dtype in (torch.float32, torch.bfloat16):
            if n_dims == 1:
                # one-dimensional tensors should be kept in F32
                # also speeds up inference due to not dequantizing
                data_qtype = gguf.GGMLQuantizationType.F32

            elif n_params <= QUANTIZATION_THRESHOLD:
                # very small tensors
                data_qtype = gguf.GGMLQuantizationType.F32

            elif any(x in key for x in model_arch.keys_hiprec):
                # tensors that require max precision
                data_qtype = gguf.GGMLQuantizationType.F32

        if (model_arch.shape_fix # NEVER reshape for models such as flux
            and n_dims > 1 # Skip one-dimensional tensors
            and n_params >= REARRANGE_THRESHOLD # Only rearrange tensors meeting the size requirement
            and (n_params / 256).is_integer() # Rearranging only makes sense if total elements is divisible by 256
            and not (data.shape[-1] / 256).is_integer() # Only need to rearrange if the last dimension is not divisible by 256
        ):
            orig_shape = data.shape
            data = data.reshape(n_params // 256, 256)
            writer.add_array(f"comfy.gguf.orig_shape.{key}", tuple(int(dim) for dim in orig_shape))

        try:
            data = gguf.quants.quantize(data, data_qtype)
            quantized_tensors[key] = data_qtype
        except (AttributeError, gguf.QuantError) as e:
            tqdm.write(f"falling back to F16: {e}")
            data_qtype = gguf.GGMLQuantizationType.F16
            data = gguf.quants.quantize(data, data_qtype)
            quantized_tensors[key] = data_qtype

        shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}"
        tqdm.write(f"{f'%-{max_name_len + 4}s' % f'{key}'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")

        writer.add_tensor(key, data, raw_dtype=data_qtype)

    return quantized_tensors, invalid_tensors

def convert_file(path, dst_path=None, interact=True, overwrite=False, allow_fp32=False):
    # load & run model detection logic
    state_dict = load_state_dict(path)
    model_arch = detect_arch(state_dict)
    logging.info(f"* Architecture detected from input: {model_arch.arch}")

    ftype_name, ftype_gguf = find_main_dtype(state_dict, allow_fp32=allow_fp32)

    if dst_path is None:
        dst_path = f"{os.path.splitext(path)[0]}-{ftype_name}.gguf"
    elif "{ftype}" in dst_path: # lcpp logic
        dst_path = dst_path.replace("{ftype}", ftype_name)

    if os.path.isfile(dst_path) and not overwrite:
        if interact:
            input("Output exists enter to continue or ctrl+c to abort!")
        else:
            raise OSError("Output exists and overwriting is disabled!")

    # handle actual file
    writer = gguf.GGUFWriter(path=None, arch=model_arch.arch)
    writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
    if ftype_gguf is not None:
        writer.add_file_type(ftype_gguf)

    quantized_tensors, invalid_tensors = handle_tensors(writer, state_dict, model_arch, allow_fp32=allow_fp32)
    if len(invalid_tensors) > 0:
        if not model_arch.ndims_fix: # only applies to 5D fix for now, possibly expand to cover more cases?
            raise ValueError(f"Tensor(s) detected that exceeds dims supported by C++ code! ({invalid_tensors.keys()})")

        fix_path = os.path.join(
            os.path.dirname(dst_path),
            f"fix_5d_tensors_{model_arch.arch}.safetensors"
        )
        if os.path.isfile(fix_path):
            raise RuntimeError(f"Tensor fix file already exists! {path}")

        invalid_tensors = {k:torch.from_numpy(v.copy()) for k,v in invalid_tensors.items()}
        save_file(invalid_tensors, fix_path)
        logging.warning(f"\n### Warning! Fix file found at '{fix_path}'")
        logging.warning(" you most likely need to run 'fix_5d_tensors.py' after quantization.")
    else:
        fix_path = None

    writer.write_header_to_file(path=dst_path)
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file(progress=True)
    writer.close()

    return dst_path, model_arch, fix_path

if __name__ == "__main__":
    args = parse_args()
    convert_file(args.src, args.dst)
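
For reference, the deleted converter produced an unquantized F16/BF16 GGUF from a single UNET checkpoint and reported whether a separate 5D-tensor fix file was written. A minimal usage sketch, assuming hypothetical file names:

# CLI:  python tools/convert.py --src wan2.1-t2v.safetensors --dst wan2.1-t2v-{ftype}.gguf
# The same entry point was importable; "{ftype}" is filled in with F16/BF16/F32 by convert_file().
from convert import convert_file

dst_path, model_arch, fix_path = convert_file(
    "wan2.1-t2v.safetensors",    # hypothetical source checkpoint
    "wan2.1-t2v-{ftype}.gguf",   # hypothetical output pattern
    interact=False,
)
print(dst_path, model_arch.arch, fix_path)  # fix_path is None unless >4D tensors were split out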
tools/fix_5d_tensors.py DELETED
@@ -1,85 +0,0 @@
# (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0)
import os
import gguf
import torch
import argparse
from tqdm import tqdm
from safetensors.torch import load_file

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--src", required=True)
    parser.add_argument("--dst", required=True)
    parser.add_argument("--fix", required=False, help="Defaults to ./fix_5d_tensors_[arch].pt")
    parser.add_argument("--overwrite", action="store_true")
    args = parser.parse_args()

    if not os.path.isfile(args.src):
        parser.error(f"Invalid source file '{args.src}'")
    if not args.overwrite and os.path.exists(args.dst):
        parser.error(f"Output exists, use '--overwrite' ({args.dst})")

    return args

def get_arch_str(reader):
    field = reader.get_field("general.architecture")
    return str(field.parts[field.data[-1]], encoding="utf-8")

def get_file_type(reader):
    field = reader.get_field("general.file_type")
    ft = int(field.parts[field.data[-1]])
    return gguf.LlamaFileType(ft)

def apply_5d_fix(src, dst, fix=None, overwrite=False):
    # read existing
    reader = gguf.GGUFReader(src)
    arch = get_arch_str(reader)
    file_type = get_file_type(reader)
    print(f"Detected arch: '{arch}' (ftype: {str(file_type)})")

    # prep fix
    if fix is None:
        fix = f"./fix_5d_tensors_{arch}.safetensors"

    if not os.path.isfile(fix):
        raise OSError(f"No 5D tensor fix file: {fix}")

    sd5d = load_file(fix)
    sd5d = {k:v.numpy() for k,v in sd5d.items()}
    print("5D tensors:", sd5d.keys())

    # prep output
    writer = gguf.GGUFWriter(path=None, arch=arch)
    writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
    writer.add_file_type(file_type)

    global added
    added = []
    def add_extra_key(writer, key, data):
        global added
        data_qtype = gguf.GGMLQuantizationType.F32
        data = gguf.quants.quantize(data, data_qtype)
        tqdm.write(f"Adding key {key} ({data.shape})")
        writer.add_tensor(key, data, raw_dtype=data_qtype)
        added.append(key)

    # main loop to add missing 5D tensor(s)
    for tensor in tqdm(reader.tensors):
        writer.add_tensor(tensor.name, tensor.data, raw_dtype=tensor.tensor_type)
        key5d = tensor.name.replace(".bias", ".weight")
        if key5d in sd5d.keys():
            add_extra_key(writer, key5d, sd5d[key5d])

    # brute force for any missed
    for key, data in sd5d.items():
        if key not in added:
            add_extra_key(writer, key, data)

    writer.write_header_to_file(path=dst)
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file(progress=True)
    writer.close()

if __name__ == "__main__":
    args = get_args()
    apply_5d_fix(args.src, args.dst, fix=args.fix, overwrite=args.overwrite)
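
For reference, this helper merged the tensors with more than four dimensions (saved separately by convert.py) back into a quantized GGUF. A minimal sketch, assuming hypothetical file names:

# CLI:  python tools/fix_5d_tensors.py --src model-Q8_0.gguf --dst model-Q8_0-fixed.gguf
from fix_5d_tensors import apply_5d_fix

apply_5d_fix(
    "model-Q8_0.gguf",                      # quantized input (hypothetical)
    "model-Q8_0-fixed.gguf",                # output with the 5D tensors restored (hypothetical)
    fix="fix_5d_tensors_wan.safetensors",   # written by convert.py; defaults to ./fix_5d_tensors_[arch].safetensors
)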
tools/fix_lines_ending.py DELETED
@@ -1,31 +0,0 @@
import os

files = ["lcpp.patch", "lcpp_sd3.patch"]

def has_unix_line_endings(file_path):
    try:
        with open(file_path, 'rb') as file:
            content = file.read()
        return b'\r\n' not in content
    except Exception as e:
        print(f"Error checking '{file_path}': {e}")
        return False

def convert_to_linux_format(file_path):
    try:
        with open(file_path, 'rb') as file:
            content = file.read().replace(b'\r\n', b'\n')
        with open(file_path, 'wb') as file:
            file.write(content)
        print(f"'{file_path}' converted to Linux line endings (LF).")
    except Exception as e:
        print(f"Error processing '{file_path}': {e}")

for file in files:
    if os.path.exists(file):
        if has_unix_line_endings(file):
            print(f"'{file}' already has Unix line endings (LF). No conversion needed.")
        else:
            convert_to_linux_format(file)
    else:
        print(f"File '{file}' does not exist.")
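
This helper only normalized line endings on the bundled patch files so that git apply would not reject them; the equivalent one-off, as a sketch:

with open("lcpp.patch", "rb") as f:
    data = f.read().replace(b"\r\n", b"\n")  # CRLF -> LF
with open("lcpp.patch", "wb") as f:
    f.write(data)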
tools/lcpp.patch DELETED
@@ -1,499 +0,0 @@
1
- diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
2
- index de3c706f..0267c1fa 100644
3
- --- a/ggml/include/ggml.h
4
- +++ b/ggml/include/ggml.h
5
- @@ -223,7 +223,7 @@
6
- #define GGML_MAX_OP_PARAMS 64
7
-
8
- #ifndef GGML_MAX_NAME
9
- -# define GGML_MAX_NAME 64
10
- +# define GGML_MAX_NAME 128
11
- #endif
12
-
13
- #define GGML_DEFAULT_N_THREADS 4
14
- @@ -2449,6 +2449,7 @@ extern "C" {
15
-
16
- // manage tensor info
17
- GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
18
- + GGML_API void gguf_set_tensor_ndim(struct gguf_context * ctx, const char * name, int n_dim);
19
- GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
20
- GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
21
-
22
- diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
23
- index b16c462f..6d1568f1 100644
24
- --- a/ggml/src/ggml.c
25
- +++ b/ggml/src/ggml.c
26
- @@ -22960,6 +22960,14 @@ void gguf_add_tensor(
27
- ctx->header.n_tensors++;
28
- }
29
-
30
- +void gguf_set_tensor_ndim(struct gguf_context * ctx, const char * name, const int n_dim) {
31
- + const int idx = gguf_find_tensor(ctx, name);
32
- + if (idx < 0) {
33
- + GGML_ABORT("tensor not found");
34
- + }
35
- + ctx->infos[idx].n_dims = n_dim;
36
- +}
37
- +
38
- void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
39
- const int idx = gguf_find_tensor(ctx, name);
40
- if (idx < 0) {
41
- diff --git a/src/llama.cpp b/src/llama.cpp
42
- index 24e1f1f0..8a1e9ef8 100644
43
- --- a/src/llama.cpp
44
- +++ b/src/llama.cpp
45
- @@ -205,6 +205,18 @@ enum llm_arch {
46
- LLM_ARCH_GRANITE,
47
- LLM_ARCH_GRANITE_MOE,
48
- LLM_ARCH_CHAMELEON,
49
- + LLM_ARCH_FLUX,
50
- + LLM_ARCH_SD1,
51
- + LLM_ARCH_SDXL,
52
- + LLM_ARCH_SD3,
53
- + LLM_ARCH_AURA,
54
- + LLM_ARCH_LTXV,
55
- + LLM_ARCH_HYVID,
56
- + LLM_ARCH_WAN,
57
- + LLM_ARCH_HIDREAM,
58
- + LLM_ARCH_COSMOS,
59
- + LLM_ARCH_LUMINA2,
60
- + LLM_ARCH_QWEN_IMAGE,
61
- LLM_ARCH_UNKNOWN,
62
- };
63
-
64
- @@ -258,6 +270,18 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
65
- { LLM_ARCH_GRANITE, "granite" },
66
- { LLM_ARCH_GRANITE_MOE, "granitemoe" },
67
- { LLM_ARCH_CHAMELEON, "chameleon" },
68
- + { LLM_ARCH_FLUX, "flux" },
69
- + { LLM_ARCH_SD1, "sd1" },
70
- + { LLM_ARCH_SDXL, "sdxl" },
71
- + { LLM_ARCH_SD3, "sd3" },
72
- + { LLM_ARCH_AURA, "aura" },
73
- + { LLM_ARCH_LTXV, "ltxv" },
74
- + { LLM_ARCH_HYVID, "hyvid" },
75
- + { LLM_ARCH_WAN, "wan" },
76
- + { LLM_ARCH_HIDREAM, "hidream" },
77
- + { LLM_ARCH_COSMOS, "cosmos" },
78
- + { LLM_ARCH_LUMINA2, "lumina2" },
79
- + { LLM_ARCH_QWEN_IMAGE, "qwen_image" },
80
- { LLM_ARCH_UNKNOWN, "(unknown)" },
81
- };
82
-
83
- @@ -1531,6 +1555,18 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
84
- { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
85
- },
86
- },
87
- + { LLM_ARCH_FLUX, {}},
88
- + { LLM_ARCH_SD1, {}},
89
- + { LLM_ARCH_SDXL, {}},
90
- + { LLM_ARCH_SD3, {}},
91
- + { LLM_ARCH_AURA, {}},
92
- + { LLM_ARCH_LTXV, {}},
93
- + { LLM_ARCH_HYVID, {}},
94
- + { LLM_ARCH_WAN, {}},
95
- + { LLM_ARCH_HIDREAM, {}},
96
- + { LLM_ARCH_COSMOS, {}},
97
- + { LLM_ARCH_LUMINA2, {}},
98
- + { LLM_ARCH_QWEN_IMAGE, {}},
99
- {
100
- LLM_ARCH_UNKNOWN,
101
- {
102
- @@ -5403,6 +5439,26 @@ static void llm_load_hparams(
103
- // get general kv
104
- ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
105
-
106
- + // Disable LLM metadata for image models
107
- + switch (model.arch) {
108
- + case LLM_ARCH_FLUX:
109
- + case LLM_ARCH_SD1:
110
- + case LLM_ARCH_SDXL:
111
- + case LLM_ARCH_SD3:
112
- + case LLM_ARCH_AURA:
113
- + case LLM_ARCH_LTXV:
114
- + case LLM_ARCH_HYVID:
115
- + case LLM_ARCH_WAN:
116
- + case LLM_ARCH_HIDREAM:
117
- + case LLM_ARCH_COSMOS:
118
- + case LLM_ARCH_LUMINA2:
119
- + case LLM_ARCH_QWEN_IMAGE:
120
- + model.ftype = ml.ftype;
121
- + return;
122
- + default:
123
- + break;
124
- + }
125
- +
126
- // get hparams kv
127
- ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
128
-
129
- @@ -18016,6 +18072,158 @@ static void llama_tensor_dequantize_internal(
130
- workers.clear();
131
- }
132
-
133
- +static ggml_type img_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
134
- + // Special function for quantizing image model tensors
135
- + const std::string name = ggml_get_name(tensor);
136
- + const llm_arch arch = qs.model.arch;
137
- +
138
- + // Sanity check
139
- + if (
140
- + (name.find("model.diffusion_model.") != std::string::npos) ||
141
- + (name.find("first_stage_model.") != std::string::npos) ||
142
- + (name.find("single_transformer_blocks.") != std::string::npos) ||
143
- + (name.find("joint_transformer_blocks.") != std::string::npos)
144
- + ) {
145
- + throw std::runtime_error("Invalid input GGUF file. This is not a supported UNET model");
146
- + }
147
- +
148
- + // Unsupported quant types - exclude all IQ quants for now
149
- + if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
150
- + ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
151
- + ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
152
- + ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
153
- + ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
154
- + ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_Q4_0_4_4 ||
155
- + ftype == LLAMA_FTYPE_MOSTLY_Q4_0_4_8 || ftype == LLAMA_FTYPE_MOSTLY_Q4_0_8_8) {
156
- + throw std::runtime_error("Invalid quantization type for image model (Not supported)");
157
- + }
158
- +
159
- + if ( // Rules for to_v attention
160
- + (name.find("attn_v.weight") != std::string::npos) ||
161
- + (name.find(".to_v.weight") != std::string::npos) ||
162
- + (name.find(".v.weight") != std::string::npos) ||
163
- + (name.find(".attn.w1v.weight") != std::string::npos) ||
164
- + (name.find(".attn.w2v.weight") != std::string::npos) ||
165
- + (name.find(".add_v_proj.weight") != std::string::npos) ||
166
- + (name.find("_attn.v_proj.weight") != std::string::npos)
167
- + ){
168
- + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
169
- + new_type = GGML_TYPE_Q3_K;
170
- + }
171
- + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
172
- + new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
173
- + }
174
- + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
175
- + new_type = GGML_TYPE_Q5_K;
176
- + }
177
- + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
178
- + new_type = GGML_TYPE_Q6_K;
179
- + }
180
- + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) {
181
- + new_type = GGML_TYPE_Q5_K;
182
- + }
183
- + ++qs.i_attention_wv;
184
- + } else if ( // Rules for fused qkv attention
185
- + (name.find("attn_qkv.weight") != std::string::npos) ||
186
- + (name.find("attn.qkv.weight") != std::string::npos) ||
187
- + (name.find("attention.qkv.weight") != std::string::npos)
188
- + ) {
189
- + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
190
- + new_type = GGML_TYPE_Q4_K;
191
- + }
192
- + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
193
- + new_type = GGML_TYPE_Q5_K;
194
- + }
195
- + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
196
- + new_type = GGML_TYPE_Q6_K;
197
- + }
198
- + } else if ( // Rules for ffn
199
- + (name.find("ffn_down") != std::string::npos) ||
200
- + ((name.find("experts.") != std::string::npos) && (name.find(".w2.weight") != std::string::npos)) ||
201
- + (name.find(".ffn.2.weight") != std::string::npos) || // is this even the right way around?
202
- + (name.find(".ff.net.2.weight") != std::string::npos) ||
203
- + (name.find(".mlp.layer2.weight") != std::string::npos) ||
204
- + (name.find(".adaln_modulation_mlp.2.weight") != std::string::npos) ||
205
- + (name.find(".feed_forward.w2.weight") != std::string::npos) ||
206
- + (name.find(".img_mlp.net.2.weight") != std::string::npos) ||
207
- + (name.find(".txt_mlp.net.2.weight") != std::string::npos)
208
- + ) {
209
- + // TODO: add back `layer_info` with some model specific logic + logic further down
210
- + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
211
- + new_type = GGML_TYPE_Q4_K;
212
- + }
213
- + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
214
- + new_type = GGML_TYPE_Q5_K;
215
- + }
216
- + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) {
217
- + new_type = GGML_TYPE_Q5_K;
218
- + }
219
- + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
220
- + new_type = GGML_TYPE_Q6_K;
221
- + }
222
- + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
223
- + new_type = GGML_TYPE_Q6_K;
224
- + }
225
- + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_0) {
226
- + new_type = GGML_TYPE_Q4_1;
227
- + }
228
- + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_0) {
229
- + new_type = GGML_TYPE_Q5_1;
230
- + }
231
- + ++qs.i_ffn_down;
232
- + }
233
- +
234
- + // first/last block high precision test
235
- + if (arch == LLM_ARCH_QWEN_IMAGE){
236
- + if (
237
- + (name.find("transformer_blocks.0.") != std::string::npos) ||
238
- + (name.find("transformer_blocks.59.") != std::string::npos) // this should be dynamic
239
- + ) {
240
- + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
241
- + new_type = GGML_TYPE_Q4_K;
242
- + }
243
- + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
244
- + new_type = GGML_TYPE_Q4_K;
245
- + }
246
- + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
247
- + new_type = GGML_TYPE_Q5_K;
248
- + }
249
- + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
250
- + new_type = GGML_TYPE_Q6_K;
251
- + }
252
- + }
253
- + }
254
- +
255
- + // Sanity check for row shape
256
- + bool convert_incompatible_tensor = false;
257
- + if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
258
- + new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
259
- + int nx = tensor->ne[0];
260
- + int ny = tensor->ne[1];
261
- + if (nx % QK_K != 0) {
262
- + LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
263
- + convert_incompatible_tensor = true;
264
- + } else {
265
- + ++qs.n_k_quantized;
266
- + }
267
- + }
268
- + if (convert_incompatible_tensor) {
269
- + // TODO: Possibly reenable this in the future
270
- + // switch (new_type) {
271
- + // case GGML_TYPE_Q2_K:
272
- + // case GGML_TYPE_Q3_K:
273
- + // case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
274
- + // case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
275
- + // case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
276
- + // default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
277
- + // }
278
- + new_type = GGML_TYPE_F16;
279
- + LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
280
- + ++qs.n_fallback;
281
- + }
282
- + return new_type;
283
- +}
284
- +
285
- static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
286
- const std::string name = ggml_get_name(tensor);
287
-
288
- @@ -18513,7 +18721,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
289
- if (llama_model_has_encoder(&model)) {
290
- n_attn_layer *= 3;
291
- }
292
- - GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
293
- + if (model.arch != LLM_ARCH_HYVID) { // TODO: Check why this fails
294
- + GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
295
- + }
296
- }
297
-
298
- size_t total_size_org = 0;
299
- @@ -18547,6 +18757,57 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
300
- ctx_outs[i_split] = gguf_init_empty();
301
- }
302
- gguf_add_tensor(ctx_outs[i_split], tensor);
303
- + // SD3 pos_embed needs special fix as first dim is 1, which gets truncated here
304
- + if (model.arch == LLM_ARCH_SD3) {
305
- + const std::string name = ggml_get_name(tensor);
306
- + if (name == "pos_embed" && tensor->ne[2] == 1) {
307
- + const int n_dim = 3;
308
- + gguf_set_tensor_ndim(ctx_outs[i_split], "pos_embed", n_dim);
309
- + LLAMA_LOG_INFO("\n%s: Correcting pos_embed shape for SD3: [key:%s]\n", __func__, tensor->name);
310
- + }
311
- + }
312
- + // same goes for auraflow
313
- + if (model.arch == LLM_ARCH_AURA) {
314
- + const std::string name = ggml_get_name(tensor);
315
- + if (name == "positional_encoding" && tensor->ne[2] == 1) {
316
- + const int n_dim = 3;
317
- + gguf_set_tensor_ndim(ctx_outs[i_split], "positional_encoding", n_dim);
318
- + LLAMA_LOG_INFO("\n%s: Correcting positional_encoding shape for AuraFlow: [key:%s]\n", __func__, tensor->name);
319
- + }
320
- + if (name == "register_tokens" && tensor->ne[2] == 1) {
321
- + const int n_dim = 3;
322
- + gguf_set_tensor_ndim(ctx_outs[i_split], "register_tokens", n_dim);
323
- + LLAMA_LOG_INFO("\n%s: Correcting register_tokens shape for AuraFlow: [key:%s]\n", __func__, tensor->name);
324
- + }
325
- + }
326
- + // conv3d fails due to max dims - unsure what to do here as we never even reach this check
327
- + if (model.arch == LLM_ARCH_HYVID) {
328
- + const std::string name = ggml_get_name(tensor);
329
- + if (name == "img_in.proj.weight" && tensor->ne[5] != 1 ) {
330
- + throw std::runtime_error("img_in.proj.weight size failed for HyVid");
331
- + }
332
- + }
333
- + // All the modulation layers also have dim1, and I think conv3d fails here too but we segfault way before that...
334
- + if (model.arch == LLM_ARCH_WAN) {
335
- + const std::string name = ggml_get_name(tensor);
336
- + if (name.find(".modulation") != std::string::npos && tensor->ne[2] == 1) {
337
- + const int n_dim = 3;
338
- + gguf_set_tensor_ndim(ctx_outs[i_split], tensor->name, n_dim);
339
- + LLAMA_LOG_INFO("\n%s: Correcting shape for Wan: [key:%s]\n", __func__, tensor->name);
340
- + }
341
- + // FLF2V model only
342
- + if (name == "img_emb.emb_pos") {
343
- + const int n_dim = 3;
344
- + gguf_set_tensor_ndim(ctx_outs[i_split], tensor->name, n_dim);
345
- + LLAMA_LOG_INFO("\n%s: Correcting shape for Wan FLF2V: [key:%s]\n", __func__, tensor->name);
346
- + }
347
- + // S2V model only
348
- + if (name == "casual_audio_encoder.weights" || name == "casual_audio_encoder.encoder.padding_tokens") {
349
- + const int n_dim = 4;
350
- + gguf_set_tensor_ndim(ctx_outs[i_split], tensor->name, n_dim);
351
- + LLAMA_LOG_INFO("\n%s: Correcting shape for Wan S2V: [key:%s]\n", __func__, tensor->name);
352
- + }
353
- + }
354
- }
355
-
356
- // Set split info if needed
357
- @@ -18647,6 +18908,124 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
358
- // do not quantize relative position bias (T5)
359
- quantize &= name.find("attn_rel_b.weight") == std::string::npos;
360
-
361
- + // rules for image models
362
- + bool image_model = false;
363
- + if (model.arch == LLM_ARCH_FLUX) {
364
- + image_model = true;
365
- + quantize &= name.find("txt_in.") == std::string::npos;
366
- + quantize &= name.find("img_in.") == std::string::npos;
367
- + quantize &= name.find("time_in.") == std::string::npos;
368
- + quantize &= name.find("vector_in.") == std::string::npos;
369
- + quantize &= name.find("guidance_in.") == std::string::npos;
370
- + quantize &= name.find("final_layer.") == std::string::npos;
371
- + }
372
- + if (model.arch == LLM_ARCH_SD1 || model.arch == LLM_ARCH_SDXL) {
373
- + image_model = true;
374
- + quantize &= name.find("class_embedding.") == std::string::npos;
375
- + quantize &= name.find("time_embedding.") == std::string::npos;
376
- + quantize &= name.find("add_embedding.") == std::string::npos;
377
- + quantize &= name.find("time_embed.") == std::string::npos;
378
- + quantize &= name.find("label_emb.") == std::string::npos;
379
- + quantize &= name.find("conv_in.") == std::string::npos;
380
- + quantize &= name.find("conv_out.") == std::string::npos;
381
- + quantize &= name != "input_blocks.0.0.weight";
382
- + quantize &= name != "out.2.weight";
383
- + }
384
- + if (model.arch == LLM_ARCH_SD3) {
385
- + image_model = true;
386
- + quantize &= name.find("final_layer.") == std::string::npos;
387
- + quantize &= name.find("time_text_embed.") == std::string::npos;
388
- + quantize &= name.find("context_embedder.") == std::string::npos;
389
- + quantize &= name.find("t_embedder.") == std::string::npos;
390
- + quantize &= name.find("y_embedder.") == std::string::npos;
391
- + quantize &= name.find("x_embedder.") == std::string::npos;
392
- + quantize &= name != "proj_out.weight";
393
- + quantize &= name != "pos_embed";
394
- + }
395
- + if (model.arch == LLM_ARCH_AURA) {
396
- + image_model = true;
397
- + quantize &= name.find("t_embedder.") == std::string::npos;
398
- + quantize &= name.find("init_x_linear.") == std::string::npos;
399
- + quantize &= name != "modF.1.weight";
400
- + quantize &= name != "cond_seq_linear.weight";
401
- + quantize &= name != "final_linear.weight";
402
- + quantize &= name != "final_linear.weight";
403
- + quantize &= name != "positional_encoding";
404
- + quantize &= name != "register_tokens";
405
- + }
406
- + if (model.arch == LLM_ARCH_LTXV) {
407
- + image_model = true;
408
- + quantize &= name.find("adaln_single.") == std::string::npos;
409
- + quantize &= name.find("caption_projection.") == std::string::npos;
410
- + quantize &= name.find("patchify_proj.") == std::string::npos;
411
- + quantize &= name.find("proj_out.") == std::string::npos;
412
- + quantize &= name.find("scale_shift_table") == std::string::npos; // last block too
413
- + }
414
- + if (model.arch == LLM_ARCH_HYVID) {
415
- + image_model = true;
416
- + quantize &= name.find("txt_in.") == std::string::npos;
417
- + quantize &= name.find("img_in.") == std::string::npos;
418
- + quantize &= name.find("time_in.") == std::string::npos;
419
- + quantize &= name.find("vector_in.") == std::string::npos;
420
- + quantize &= name.find("guidance_in.") == std::string::npos;
421
- + quantize &= name.find("final_layer.") == std::string::npos;
422
- + }
423
- + if (model.arch == LLM_ARCH_WAN) {
424
- + image_model = true;
425
- + quantize &= name.find("modulation.") == std::string::npos;
426
- + quantize &= name.find("patch_embedding.") == std::string::npos;
427
- + quantize &= name.find("text_embedding.") == std::string::npos;
428
- + quantize &= name.find("time_projection.") == std::string::npos;
429
- + quantize &= name.find("time_embedding.") == std::string::npos;
430
- + quantize &= name.find("img_emb.") == std::string::npos;
431
- + quantize &= name.find("head.") == std::string::npos;
432
- + // S2V
433
- + quantize &= name.find("cond_encoder.") == std::string::npos;
434
- + quantize &= name.find("frame_packer.") == std::string::npos;
435
- + quantize &= name.find("audio_injector.") == std::string::npos;
436
- + quantize &= name.find("casual_audio_encoder.") == std::string::npos;
437
- + quantize &= name.find("trainable_cond_mask.") == std::string::npos;
438
- + }
439
- + if (model.arch == LLM_ARCH_HIDREAM) {
440
- + image_model = true;
441
- + quantize &= name.find("p_embedder.") == std::string::npos;
442
- + quantize &= name.find("t_embedder.") == std::string::npos;
443
- + quantize &= name.find("x_embedder.") == std::string::npos;
444
- + quantize &= name.find("final_layer.") == std::string::npos;
445
- + quantize &= name.find(".ff_i.gate.weight") == std::string::npos;
446
- + quantize &= name.find("caption_projection.") == std::string::npos;
447
- + }
448
- + if (model.arch == LLM_ARCH_COSMOS) {
449
- + image_model = true;
450
- + quantize &= name.find("p_embedder.") == std::string::npos;
451
- + quantize &= name.find("t_embedder.") == std::string::npos;
452
- + quantize &= name.find("t_embedding_norm.") == std::string::npos;
453
- + quantize &= name.find("x_embedder.") == std::string::npos;
454
- + quantize &= name.find("pos_embedder.") == std::string::npos;
455
- + quantize &= name.find("final_layer.") == std::string::npos;
456
- + }
457
- + if (model.arch == LLM_ARCH_LUMINA2) {
458
- + image_model = true;
459
- + quantize &= name.find("t_embedder.") == std::string::npos;
460
- + quantize &= name.find("x_embedder.") == std::string::npos;
461
- + quantize &= name.find("final_layer.") == std::string::npos;
462
- + quantize &= name.find("cap_embedder.") == std::string::npos;
463
- + quantize &= name.find("context_refiner.") == std::string::npos;
464
- + quantize &= name.find("noise_refiner.") == std::string::npos;
465
- + }
466
- + if (model.arch == LLM_ARCH_QWEN_IMAGE) {
467
- + image_model = true;
468
- + quantize &= name.find("img_in.") == std::string::npos;
469
- + quantize &= name.find("txt_in.") == std::string::npos;
470
- + quantize &= name.find("time_text_embed.") == std::string::npos;
471
- + quantize &= name.find("proj_out.") == std::string::npos;
472
- + quantize &= name.find("norm_out.") == std::string::npos;
473
- + }
474
- + // ignore 3D/4D tensors for image models as the code was never meant to handle these
475
- + if (image_model) {
476
- + quantize &= ggml_n_dims(tensor) == 2;
477
- + }
478
- +
479
- enum ggml_type new_type;
480
- void * new_data;
481
- size_t new_size;
482
- @@ -18655,6 +19034,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
483
- new_type = default_type;
484
-
485
- // get more optimal quantization type based on the tensor shape, layer, etc.
486
- + if (image_model) {
487
- + new_type = img_tensor_get_type(qs, new_type, tensor, ftype);
488
- + } else {
489
- if (!params->pure && ggml_is_quantized(default_type)) {
490
- new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
491
- }
492
- @@ -18664,6 +19046,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
493
- if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
494
- new_type = params->output_tensor_type;
495
- }
496
- + }
497
-
498
- // If we've decided to quantize to the same type the tensor is already
499
- // in then there's nothing to do.
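
The deleted patch added the image/video architectures above to llama.cpp's quantizer and raised GGML_MAX_NAME so longer tensor names fit. The manual workflow it supported (automated by tool_auto.py below) is sketched here with hypothetical paths and file names:

import subprocess

# Clone llama.cpp at the pinned tag, apply the patch, and build llama-quantize (paths are hypothetical).
subprocess.run(["git", "clone", "https://github.com/ggerganov/llama.cpp", "llama.cpp.auto"], check=True)
subprocess.run(["git", "-C", "llama.cpp.auto", "checkout", "tags/b3962"], check=True)
subprocess.run(["git", "-C", "llama.cpp.auto", "apply", "tools/lcpp.patch"], check=True)
subprocess.run(["cmake", "-B", "llama.cpp.auto/build", "llama.cpp.auto"], check=True)
subprocess.run(["cmake", "--build", "llama.cpp.auto/build", "--config", "Debug",
                "-j4", "--target", "llama-quantize"], check=True)

# Quantize a base F16/BF16 GGUF produced by convert.py (file names are hypothetical).
subprocess.run(["llama.cpp.auto/build/bin/llama-quantize",
                "model-F16.gguf", "model-Q4_K_M.gguf", "Q4_K_M"], check=True)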
tools/read_tensors.py DELETED
@@ -1,21 +0,0 @@
#!/usr/bin/python3
import os
import sys
import gguf

def read_tensors(path):
    reader = gguf.GGUFReader(path)
    for tensor in reader.tensors:
        if tensor.tensor_type == gguf.GGMLQuantizationType.F32:
            continue
        print(f"{str(tensor.tensor_type):32}: {tensor.name}")

try:
    path = sys.argv[1]
    assert os.path.isfile(path), "Invalid path"
    print(f"input: {path}")
except Exception as e:
    input(f"failed: {e}")
else:
    read_tensors(path)
    input()
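
For reference, the script took a single GGUF path and listed every tensor that is not stored as F32, which is a quick way to inspect a finished quant. A sketch, with a hypothetical file name:

# CLI:  python tools/read_tensors.py model-Q4_K_M.gguf
from read_tensors import read_tensors
read_tensors("model-Q4_K_M.gguf")  # prints "<quantization type>: <tensor name>" for each quantized tensor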
tools/tool_auto.py DELETED
@@ -1,374 +0,0 @@
1
- # (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0)
2
- import os
3
- import re
4
- import sys
5
- import time
6
- import torch
7
- import logging
8
- import argparse
9
- import subprocess
10
- import huggingface_hub as hf
11
-
12
- logging.getLogger().setLevel(logging.DEBUG)
13
-
14
- qtypes =[
15
- # "F16", "BF16",
16
- "Q8_0", "Q6_K",
17
- "Q5_K_M", "Q5_K_S", "Q5_1", "Q5_0",
18
- "Q4_K_M", "Q4_K_S", "Q4_1", "Q4_0",
19
- "Q3_K_M", "Q3_K_S", "Q2_K"
20
- ]
21
-
22
- dtype_dict = {
23
- "F32": torch.float32,
24
- "F16": torch.float16,
25
- "BF16": torch.bfloat16,
26
- "F8_E4M3": getattr(torch, "float8_e4m3fn", "_invalid"),
27
- "F8_E5M2": getattr(torch, "float8_e5m2", "_invalid"),
28
- }
29
-
30
- # this is pretty jank but I want to be able to run it on a blank instance w/o setup
31
- terraform_dict = {
32
- "repo": "city96/ComfyUI-GGUF",
33
- "target": "auto_convert",
34
- "lcpp_repo": "ggerganov/llama.cpp",
35
- "lcpp_target": "tags/b3962",
36
- }
37
-
38
- def get_args():
39
- parser = argparse.ArgumentParser()
40
- parser.add_argument("--src", required=True, help="Source model file or huggingface repo name")
41
- parser.add_argument("--quants", nargs="+", choices=["all", "base", *qtypes], default=["Q8_0"])
42
- parser.add_argument("--output-dir", default=None, help="Location for output files, defaults to current dir or ComfyUI model dir.")
43
- parser.add_argument("--temp-dir", default=None, help="Location for temp files, defaults to [output_dir]/tmp")
44
- parser.add_argument("--force-update", action="store_true", help="Force update & rebuild entire quantization stack.")
45
- parser.add_argument("--resume", action="store_true", help="Skip over existing files. Will NOT check for broken/interrupted files.")
46
-
47
- args = parser.parse_args()
48
- if args.output_dir is None:
49
- args.output_dir = get_output_dir()
50
- if args.temp_dir is None:
51
- args.temp_dir = os.path.join(args.output_dir, "tmp")
52
-
53
- if os.path.isdir(args.temp_dir) and len(os.listdir(args.temp_dir)) > 0:
54
- raise OSError("Output temp folder not empty!")
55
-
56
- if "all" in args.quants:
57
- args.quants = ["base", *qtypes]
58
-
59
- return args
60
-
61
- def run_cmd(*args, log_error=False):
62
- logging.debug(f"cmd: {args}")
63
- try:
64
- log = subprocess.run(args, capture_output=True, text=True)
65
- except Exception as e:
66
- logging.warning(f"{args[0]}, {e}")
67
- return -1
68
- if log.returncode != 0 and log_error:
69
- logging.warning(f"{args[0]}: {log.stdout} {log.stderr}")
70
- else:
71
- logging.debug(f"{args[0]}: {repr(log.stdout)} {repr(log.stderr.strip())} RET:{log.returncode}")
72
- return log.returncode
73
-
74
- def setup_utils(force_update=False):
75
- # get ComfyUI-GGUF if missing, then compile patched llama.cpp if required
76
- root = os.path.dirname(os.path.abspath(__file__))
77
- root = os.path.normpath(root)
78
-
79
- if os.path.split(root)[1] != "tools":
80
- cg_dir = os.path.join(root, "ComfyUI-GGUF")
81
- if not os.path.isdir(cg_dir):
82
- logging.warning(f"Running outside tools folder! Cloning to {cg_dir}")
83
- run_cmd("git", "clone", f"https://github.com/{terraform_dict['repo']}", cg_dir)
84
- need_update = True
85
- else:
86
- need_update = False
87
-
88
- if force_update or need_update:
89
- if terraform_dict['target']:
90
- logging.info(f"Attempting to check out ComfyUI-GGUF branch {terraform_dict['target']}")
91
- run_cmd("git", "-C", cg_dir, "checkout", terraform_dict['target'])
92
-
93
- logging.info("Attempting to git pull ComfyUI-GGUF to latest")
94
- run_cmd("git", "-C", cg_dir, "pull")
95
-
96
- tools_dir = os.path.join(root, "ComfyUI-GGUF", "tools")
97
- sys.path.append(tools_dir) # to make import(s) work
98
- else:
99
- # TODO: Git pull here too?
100
- logging.warning(f"Assuming latest ComfyUI-GGUF. Please git pull & check out branch {terraform_dict['target']} manually!")
101
- tools_dir = root
102
-
103
- if not os.path.isdir(tools_dir):
104
- raise OSError(f"Can't find tools subfolder in ComfyUI-GGUF at {tools_dir}")
105
-
106
- convert_path = os.path.join(tools_dir, "convert.py")
107
- if not os.path.isfile(convert_path):
108
- raise OSError(f"Cannot find convert.py at location: {convert_path}")
109
-
110
- lcpp_path = os.path.join(root, "llama.cpp.auto") # avoid messing with regular dir
111
- if not os.path.isdir(lcpp_path):
112
- logging.info(f"Attempting to clone llama.cpp repo to {lcpp_path}")
113
- run_cmd("git", "clone", f"https://github.com/{terraform_dict['lcpp_repo']}", lcpp_path)
114
- need_update = True
115
- else:
116
- need_update = False
117
-
118
- if force_update or need_update:
119
- # TODO: check reflog and/or git reset before checkout?
120
- logging.info(f"Attempting to check out llama.cpp target {terraform_dict['lcpp_target']}")
121
- run_cmd("git", "-C", lcpp_path, "checkout", terraform_dict['lcpp_target'])
122
-
123
- # TODO: git reset before patch?
124
- patch_path = os.path.join(tools_dir, "lcpp.patch")
125
- # patch (probably) has wrong file endings:
126
- logging.info("Converting patch file endings")
127
- with open(patch_path, "rb") as file:
128
- content = file.read().replace(b"\r\n", b"\n")
129
- with open(patch_path, "wb") as file:
130
- file.write(content)
131
-
132
- if run_cmd("git", "-C", lcpp_path, "apply", "--check", "-R", patch_path) != 0:
133
- logging.info("Attempting to apply patch to llama.cpp repo")
134
- run_cmd("git", "-C", lcpp_path, "apply", patch_path)
135
- else:
136
- logging.info("Patch already applied")
137
-
138
- # using cmake here as llama.cpp switched to it completely for new versions
139
- if os.name == "nt":
140
- bin_path = os.path.join(lcpp_path, "build", "bin", "debug", "llama-quantize.exe")
141
- else:
142
- bin_path = os.path.join(lcpp_path, "build", "bin", "llama-quantize")
143
-
144
- if not os.path.isfile(bin_path) or force_update or need_update:
145
- if run_cmd("cmake", "--version") != 0:
146
- raise RuntimeError("Can't find cmake! Make sure you have a working build environment set up")
147
-
148
- build_path = os.path.join(lcpp_path, "build")
149
- os.makedirs(build_path, exist_ok=True)
150
- logging.info("Attempting to build llama.cpp binary from source")
151
- run_cmd("cmake", "-B", build_path, lcpp_path)
152
- run_cmd("cmake", "--build", build_path, "--config", "Debug", "-j4", "--target", "llama-quantize")
153
- if not os.path.isfile(bin_path):
154
- raise RuntimeError("Build failed! Rerun with --debug to see error log.")
155
- else:
156
- logging.info("Binary already present")
157
-
158
- return bin_path
159
-
160
- def get_output_dir():
161
- root = os.path.dirname(os.path.abspath(__file__))
162
- root = os.path.normpath(root)
163
- split = os.path.split(root)
164
- while split[1]:
165
- if split[1] == "ComfyUI":
166
- if os.path.isdir(os.path.join(*split, "models", "unet")): # new
167
- root = os.path.join(*split, "models", "unet", "gguf")
168
- logging.info(f"Found ComfyUI, using model folder: {root}")
169
- return root
170
-
171
- if os.path.isdir(os.path.join(*split, "models", "diffusion_models")): # old
172
- root = os.path.join(*split, "models", "diffusion_models", "gguf")
173
- logging.info(f"Found ComfyUI, using model folder: {root}")
174
- return root
175
-
176
- logging.info("Found ComfyUI, but can't find model folder")
177
- break
178
-
179
- split = os.path.split(split[0])
180
-
181
- root = os.path.join(root, "models")
182
- logging.info(f"Defaulting to [script dir]/models: {root}")
183
- return root
184
-
185
- def get_hf_fake_sd(repo, path, device=torch.device("meta")):
186
- sd = {}
187
- meta = hf.parse_safetensors_file_metadata(repo, path)
188
- for key, raw in meta.tensors.items():
189
- shape = tuple(raw.shape)
190
- dtype = dtype_dict.get(raw.dtype, torch.float32)
191
- sd[key] = torch.zeros(shape, dtype=dtype, device=device)
192
- return sd
193
-
194
- def get_hf_file_arch(repo, path):
195
- pattern = r'(\d+)-of-(\d+)'
196
- match = re.search(pattern, path)
197
-
198
- if match:
199
- # we need to load it as multipart
200
- if int(match.group(1)) != 1:
201
- return None
202
- sd = {}
203
- for k in range(int(match.group(2))):
204
- shard_path = path.replace(match.group(1), f"{k+1:0{len(match.group(1))}}")
205
- sd.update(get_hf_fake_sd(repo, shard_path))
206
- else:
207
- sd = get_hf_fake_sd(repo, path)
208
-
209
- # this should raise an error on failure
210
- sd = strip_prefix(sd)
211
- model_arch = detect_arch(sd)
212
-
213
- # this is for SDXL and SD1.5, I want to overhaul this logic to match sd.cpp eventually
214
- assert not model_arch.shape_fix, "Model uses shape fix (SDXL/SD1) - unsupported for now."
215
- return model_arch.arch
216
-
217
- def get_hf_valid_files(repo):
218
- # TODO: probably tweak this?
219
- MIN_SIZE_GB = 1
220
- VALID_SRC_EXTS = [".safetensors", ] # ".pt", ".ckpt", ]
221
- meta = hf.model_info(repo, files_metadata=True)
222
-
223
- valid = {}
224
- for file in meta.siblings:
225
- path = file.rfilename
226
- fname = os.path.basename(path)
227
- name, ext = os.path.splitext(fname)
228
-
229
- if ext.lower() not in VALID_SRC_EXTS:
230
- logging.debug(f"Invalid ext: {path} {ext}")
231
- continue
232
-
233
- if file.size / (1024 ** 3) < MIN_SIZE_GB:
234
- logging.debug(f"File too small: {path} {file.size}")
235
- continue
236
-
237
- try:
238
- arch = get_hf_file_arch(repo, path)
239
- except Exception as e:
240
- logging.warning(f"Arch detect fail: {e} ({path})")
241
- else:
242
- if arch is not None:
243
- valid[path] = arch
244
- logging.info(f"Found '{arch}' model at path {path}")
245
- return valid
246
-
247
- def make_base_quant(src, output_dir, temp_dir, final=True, resume=True):
248
- name, ext = os.path.splitext(os.path.basename(src))
249
- if ext == ".gguf":
250
- logging.info("Input file already in gguf, assuming base quant")
251
- return None, src, None
252
-
253
- name = name.lower() # uncomment to preserve case in all quants
254
- dst_tmp = os.path.join(temp_dir, f"{name}-{{ftype}}.gguf") # ftype is filled in by convert.py
255
-
256
- tmp_path, model_arch, fix_path = convert_file(src, dst_tmp, interact=False, overwrite=False)
257
- dst_path = os.path.join(output_dir, os.path.basename(tmp_path))
258
- if os.path.isfile(dst_path):
259
- if resume:
260
- logging.warning("Resuming with interrupted base quant, may be incorrect!")
261
- return dst_path, tmp_path, fix_path
262
- raise OSError(f"Output already exists! Clear folder? {dst_path}")
263
-
264
- if fix_path is not None and os.path.isfile(fix_path):
265
- quant_source = tmp_path
266
- if final:
267
- apply_5d_fix(tmp_path, dst_path, fix=fix_path, overwrite=False)
268
- else:
269
- dst_path = None
270
- else:
271
- fix_path = None
272
- if final:
273
- os.rename(tmp_path, dst_path)
274
- quant_source = dst_path
275
- else:
276
- dst_path = None
277
- quant_source = tmp_path
278
-
279
- return dst_path, quant_source, fix_path
280
-
281
- def make_quant(src, output_dir, temp_dir, qtype, quantize_binary, fix_path=None, resume=True):
282
- name, ext = os.path.splitext(os.path.basename(src))
283
- assert ext.lower() == ".gguf", "Invalid input file"
284
-
285
- src_qtext = [x for x in ["-F32.gguf", "-F16.gguf", "-BF16.gguf"] if x in src]
286
- if len(src_qtext) == 1:
287
- tmp_path = os.path.join(
288
- temp_dir,
289
- os.path.basename(src).replace(src_qtext[0], f"-{qtype.upper()}.gguf")
290
- )
291
- else:
292
- tmp_path = os.path.join(
293
- temp_dir,
294
- f"{name}-{qtype.upper()}.gguf"
295
- )
296
- tmp_path = os.path.abspath(tmp_path)
297
- dst_path = os.path.join(output_dir, os.path.basename(tmp_path))
298
- if os.path.isfile(dst_path):
299
- if resume:
300
- return dst_path
301
- raise OSError("Output already exists! Clear folder?")
302
-
303
- r = run_cmd(quantize_binary, src, tmp_path, qtype, log_error=True)
304
- time.sleep(2) # leave time for file sync?
305
- if r != 0:
306
- raise RuntimeError(f"Quantization failed with error code {r}")
307
-
308
- if fix_path is not None:
309
- apply_5d_fix(tmp_path, dst_path, fix=fix_path, overwrite=False)
310
- if os.path.isfile(dst_path) and os.path.isfile(tmp_path):
311
- os.remove(tmp_path)
312
- else:
313
- os.rename(tmp_path, dst_path)
314
-
315
- return dst_path
316
-
317
- if __name__ == "__main__":
318
- args = get_args()
319
- os.makedirs(args.output_dir, exist_ok=True)
320
- os.makedirs(args.temp_dir, exist_ok=True)
321
- quantize_binary = setup_utils(args.force_update)
322
-
323
- try:
324
- from convert import detect_arch, strip_prefix, convert_file
325
- from fix_5d_tensors import apply_5d_fix
326
- except (ImportError, ModuleNotFoundError) as e:
327
- raise ImportError(f"Can't import required utils: {e}")
328
-
329
- if not os.path.isfile(args.src):
330
- # huggingface repo. TODO: file choice
331
- if len(args.src.split("/")) != 2:
332
- raise OSError(f"Invalid huggingface repo or model path {args.src}")
333
- raise NotImplementedError("HF not yet supported")
334
- # download then set to temp file
335
- # hf_repo = "Lightricks/LTX-Video" # "fal/AuraFlow-v0.3"
336
- # get_hf_valid_files(hf_repo)
337
- # args.src = ...
338
-
339
- out_files = []
340
-
341
- base_quant, quant_source, fix_path = make_base_quant(
342
- args.src,
343
- args.output_dir,
344
- args.temp_dir,
345
- final=("base" in args.quants),
346
- resume=args.resume,
347
- )
348
- if "base" in args.quants:
349
- args.quants = [x for x in args.quants if x not in ["base"]]
350
- if base_quant is not None:
351
- out_files.append(base_quant)
352
-
353
- for qtype in args.quants:
354
- out_files.append(make_quant(
355
- quant_source,
356
- args.output_dir,
357
- args.temp_dir,
358
- qtype,
359
- quantize_binary,
360
- fix_path,
361
- resume=args.resume,
362
- ))
363
-
364
- if fix_path is not None and os.path.isfile(fix_path):
365
- os.remove(fix_path)
366
-
367
- if base_quant != quant_source:
368
- # make sure our quant source is in the temp folder before removing it
369
- cc = os.path.commonpath([os.path.normpath(quant_source), os.path.normpath(args.temp_dir)])
370
- if cc == os.path.normpath(args.temp_dir):
371
- os.remove(quant_source)
372
-
373
- out_file_str = '\n'.join(out_files)
374
- logging.info(f"Output file(s): {out_file_str}")
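
For reference, the automation script took a source checkpoint plus a list of quant types, cloned and built the patched llama-quantize binary on first run, and wrote the results to a ComfyUI model folder when it could find one. A hedged usage sketch with hypothetical paths:

# python tools/tool_auto.py --src wan2.1-t2v.safetensors --quants base Q8_0 Q5_K_M Q4_K_S --output-dir ./models
# "--quants all" expands to the base F16/BF16 GGUF plus every entry in the qtypes list above;
# "--resume" skips output files that already exist instead of raising an error.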