Upload folder using huggingface_hub

Browse files

Files changed (12) hide show

.gitattributes +3 -0
GGUF/HuMo_Q4_K_M.gguf +3 -0
GGUF/HuMo_Q6_K.gguf +3 -0
GGUF/HuMo_Q8_0.gguf +3 -0
GGUF/README.md +93 -0
GGUF/convert.py +412 -0
GGUF/fix_5d_tensors.py +85 -0
GGUF/fix_lines_ending.py +31 -0
GGUF/lcpp.patch +499 -0
GGUF/llama.cpp +0 -0
GGUF/read_tensors.py +21 -0
GGUF/tool_auto.py +374 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+GGUF/HuMo_Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
+GGUF/HuMo_Q6_K.gguf filter=lfs diff=lfs merge=lfs -text
+GGUF/HuMo_Q8_0.gguf filter=lfs diff=lfs merge=lfs -text

GGUF/HuMo_Q4_K_M.gguf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:674ad854622dd8f584541a0c740b06e66cc7bd89cde2bdc05a4a923b7f119932
+size 11454319232

GGUF/HuMo_Q6_K.gguf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ff7248b8b1019d02e8cee372186230408a117a786a7d603014aaba5914379079
+size 14438085248

GGUF/HuMo_Q8_0.gguf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fe9d4d32e7b1813de58c87cb87c036a64f542122b9aeeb2a5cb98c242d1c61d5
+size 18510776960

GGUF/README.md ADDED Viewed

	@@ -0,0 +1,93 @@

+## Converting initial model
+To convert your initial safetensors/ckpt model to FP16/BF16 GGUF, run the following command:
+```
+python convert.py --src E:\models\unet\flux1-dev.safetensors
+```
+Make sure `gguf>=0.13.0` is installed for this step. Optionally, specify the output gguf file with the `--dst` arg.
+> [!NOTE]
+> Do not use the diffusers UNET format for flux, it won't work, use the default/reference checkpoint key format. This is due to q/k/v being merged into one qkv key.
+> You can convert it by loading it in ComfyUI and saving it using the built-in "ModelSave" node.
+> [!WARNING]
+> For hunyuan video/wan 2.1, you will see a warning about 5D tensors. This means the script will save a **non functional** model to disk first, that you can quantize. I recommend saving these in a separate `raw` folder to avoid confusion.
+>
+> After quantization, you will have to run `fix_5d_tensors.py` manually to add back the missing key that was saved by the conversion code.
+## Quantizing using custom llama.cpp
+Depending on your git settings, you may need to run the following script first in order to make sure the patch file is valid. It will convert Windows (CRLF) line endings to Unix (LF) ones.
+```
+python fix_lines_ending.py
+```
+Git clone llama.cpp into the current folder:
+```
+git clone https://github.com/ggerganov/llama.cpp
+```
+Check out the correct branch, then apply the custom patch needed to add image model support to the repo you just cloned.
+```
+cd llama.cpp
+git checkout tags/b3962
+git apply ..\lcpp.patch
+```
+Compile the llama-quantize binary. This example uses cmake; on Linux you can just use make.
+### Visual Studio 2019, Linux, etc...
+```
+mkdir build
+cmake -B build
+cmake --build build --config Debug -j10 --target llama-quantize
+cd ..
+```
+### Visual Studio 2022
+```
+mkdir build
+cmake -B build -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_STANDARD_REQUIRED=ON -DCMAKE_CXX_FLAGS="-std=c++17"
+```
+Edit the `llama.cpp\common\log.cpp` file and insert two lines after the existing first line:
+```
+#include "log.h"
+#define _SILENCE_CXX23_CHRONO_DEPRECATION_WARNING
+#include <chrono>
+```
+Then you can build the project:
+```
+cmake --build build --config Debug -j10 --target llama-quantize
+cd ..
+```
+### Quantize your model
+Now you can use the newly built binary to quantize your model to the desired format:
+```
+llama.cpp\build\bin\Debug\llama-quantize.exe E:\models\unet\flux1-dev-BF16.gguf E:\models\unet\flux1-dev-Q4_K_S.gguf Q4_K_S
+```
+You can extract the patch again with `git diff src\llama.cpp > lcpp.patch` if you wish to change something and contribute back.
+> [!WARNING]
+> For hunyuan video/wan 2.1, you will have to run `fix_5d_tensors.py` after the quantization step is done.
+>
+> Example usage:  `fix_5d_tensors.py --src E:\models\video\raw\wan2.1-t2v-1.3b-Q8_0.gguf --dst E:\models\video\wan2.1-t2v-1.3b-Q8_0.gguf`
+>
+> By default, this also saves a `fix_5d_tensors_[arch].safetensors` file in the `ComfyUI-GGUF/tools` folder, it's recommended to delete this after all models have been converted.
+> [!NOTE]
+> Do not quantize SDXL / SD1 / other Conv2D heavy models. If you do, make sure to **extract the UNET model first**.
+> This should be obvious, but also don't use the resulting llama-quantize binary with LLMs.

GGUF/convert.py ADDED Viewed

	@@ -0,0 +1,412 @@

+# (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0)
+import os
+import gguf
+import torch
+import logging
+import argparse
+from tqdm import tqdm
+from safetensors.torch import load_file, save_file
+# float32/bfloat16 tensors with at most this many elements are stored as F32 by handle_tensors.
+QUANTIZATION_THRESHOLD = 1024
+# Minimum element count before handle_tensors considers the (n, 256) reshape (shape_fix archs only).
+REARRANGE_THRESHOLD = 512
+# Longest tensor name handle_tensors accepts; longer names raise ValueError.
+MAX_TENSOR_NAME_LENGTH = 127
+# Tensors with more dimensions than this are set aside by handle_tensors instead of being written.
+MAX_TENSOR_DIMS = 4
+class ModelTemplate:
+    """Base description of a convertible architecture; subclasses override the fields below.
+    `keys_detect` holds alternative signatures: is_model_arch reports a match as soon as
+    every key of one signature is present in the state dict.
+    """
+    arch = "invalid"  # string describing architecture
+    shape_fix = False # whether to reshape tensors
+    ndims_fix = False # whether to save fix file for tensors exceeding max dims
+    keys_detect = []  # list of lists to match in state dict
+    keys_banned = []  # list of keys that should mark model as invalid for conversion
+    keys_hiprec = []  # list of keys that need to be kept in fp32 for some reason
+    keys_ignore = []  # list of strings to ignore keys by when found
+class ModelFlux(ModelTemplate):
+    """Signature for the "flux" architecture.
+    The first signature is also the banned key, so a state dict containing it is matched
+    and then rejected by the assertion in is_model_arch.
+    """
+    arch = "flux"
+    keys_detect = [
+        ("single_transformer_blocks.0.attn.norm_k.weight",),
+        ("double_blocks.0.img_attn.proj.weight",),
+    ]
+    keys_banned = ["single_transformer_blocks.0.attn.norm_k.weight",]
+class ModelSD3(ModelTemplate):
+    """Signature for the "sd3" architecture.
+    The first signature is also the banned key, so a state dict containing it is matched
+    and then rejected by the assertion in is_model_arch.
+    """
+    arch = "sd3"
+    keys_detect = [
+        ("transformer_blocks.0.ff_context.net.0.proj.weight",),
+        ("joint_blocks.0.x_block.attn.qkv.weight",),
+    ]
+    keys_banned = ["transformer_blocks.0.ff_context.net.0.proj.weight",]
+class ModelAura(ModelTemplate):
+    """Signature for the "aura" architecture.
+    The second signature is also the banned key, so a state dict that matches only through
+    it is rejected by the assertion in is_model_arch.
+    """
+    arch = "aura"
+    keys_detect = [
+        ("double_layers.3.modX.1.weight",),
+        ("joint_transformer_blocks.3.ff_context.out_projection.weight",),
+    ]
+    keys_banned = ["joint_transformer_blocks.3.ff_context.out_projection.weight",]
+class ModelHiDream(ModelTemplate):
+    """Signature for the "hidream" architecture; both keys of the single signature must be present."""
+    arch = "hidream"
+    keys_detect = [
+        (
+            "caption_projection.0.linear.weight",
+            "double_stream_blocks.0.block.ff_i.shared_experts.w3.weight"
+        )
+    ]
+    # Substrings matched against tensor names; matching float32/bfloat16 tensors are written as F32.
+    keys_hiprec = [
+        # nn.parameter, can't load from BF16 ver
+        ".ff_i.gate.weight",
+        "img_emb.emb_pos"
+    ]
+class ModelCosmosPredict2(ModelTemplate):
+    """Signature for the "cosmos" architecture; both keys of the single signature must be present."""
+    arch = "cosmos"
+    keys_detect = [
+        (
+            "blocks.0.mlp.layer1.weight",
+            "blocks.0.adaln_modulation_cross_attn.1.weight",
+        )
+    ]
+    # Substring match: float32/bfloat16 tensors whose name contains this are written as F32.
+    keys_hiprec = ["pos_embedder"]
+    # Substring match: tensors whose name contains any of these are skipped by handle_tensors.
+    keys_ignore = ["_extra_state", "accum_"]
+class ModelQwenImage(ModelTemplate):
+    """Signature for the "qwen_image" architecture; all three keys must be present."""
+    arch = "qwen_image"
+    keys_detect = [
+        (
+            "time_text_embed.timestep_embedder.linear_2.weight",
+            "transformer_blocks.0.attn.norm_added_q.weight",
+            "transformer_blocks.0.img_mlp.net.0.proj.weight",
+        )
+    ]
+class ModelHyVid(ModelTemplate):
+    """Signature for the "hyvid" architecture.
+    `ndims_fix` lets convert_file save tensors above MAX_TENSOR_DIMS to a separate fix file
+    instead of raising.
+    """
+    arch = "hyvid"
+    ndims_fix = True
+    keys_detect = [
+        (
+            "double_blocks.0.img_attn_proj.weight",
+            "txt_in.individual_token_refiner.blocks.1.self_attn_qkv.weight",
+        )
+    ]
+class ModelWan(ModelTemplate):
+    """Signature for the "wan" architecture.
+    `ndims_fix` lets convert_file save tensors above MAX_TENSOR_DIMS to a separate fix file
+    instead of raising. These three keys also appear in ModelHuMo's last signature, so the
+    position of this class relative to ModelHuMo in arch_list matters.
+    """
+    arch = "wan"
+    ndims_fix = True
+    keys_detect = [
+        (
+            "blocks.0.self_attn.norm_q.weight",
+            "text_embedding.2.weight",
+            "head.modulation",
+        )
+    ]
+    # Substrings matched against tensor names; matching float32/bfloat16 tensors are written as F32.
+    keys_hiprec = [
+        ".modulation", # nn.parameter, can't load from BF16 ver
+        ".encoder.padding_tokens", # nn.parameter, specific to S2V
+        "trainable_cond_mask", # used directly w/ .weight
+        "casual_audio_encoder.weights", # nn.parameter, specific to S2V
+        "casual_audio_encoder.encoder.conv", # CausalConv1d doesn't use ops.py for now
+    ]
+class ModelLTXV(ModelTemplate):
+    """Signature for the "ltxv" architecture; all three keys must be present."""
+    arch = "ltxv"
+    keys_detect = [
+        (
+            "adaln_single.emb.timestep_embedder.linear_2.weight",
+            "transformer_blocks.27.scale_shift_table",
+            "caption_projection.linear_2.weight",
+        )
+    ]
+    # Substring match: float32/bfloat16 tensors whose name contains this are written as F32.
+    keys_hiprec = [
+        "scale_shift_table" # nn.parameter, can't load from BF16 base quant
+    ]
+class ModelSDXL(ModelTemplate):
+    """Signature for the "sdxl" architecture; any one of the three signatures is sufficient.
+    `shape_fix` enables the (n, 256) reshape in handle_tensors. The first signature is a
+    superset of ModelSD1's first signature, so this class must be tried before ModelSD1.
+    """
+    arch = "sdxl"
+    shape_fix = True
+    keys_detect = [
+        ("down_blocks.0.downsamplers.0.conv.weight", "add_embedding.linear_1.weight",),
+        (
+            "input_blocks.3.0.op.weight", "input_blocks.6.0.op.weight",
+            "output_blocks.2.2.conv.weight", "output_blocks.5.2.conv.weight",
+        ), # Non-diffusers
+        ("label_emb.0.0.weight",),
+    ]
+class ModelSD1(ModelTemplate):
+    """Signature for the "sd1" architecture; either signature is sufficient.
+    `shape_fix` enables the (n, 256) reshape in handle_tensors.
+    """
+    arch = "sd1"
+    shape_fix = True
+    keys_detect = [
+        ("down_blocks.0.downsamplers.0.conv.weight",),
+        (
+            "input_blocks.3.0.op.weight", "input_blocks.6.0.op.weight", "input_blocks.9.0.op.weight",
+            "output_blocks.2.1.conv.weight", "output_blocks.5.2.conv.weight", "output_blocks.8.2.conv.weight"
+        ), # Non-diffusers
+    ]
+class ModelLumina2(ModelTemplate):
+    """Signature for the "lumina2" architecture; both keys of the single signature must be present."""
+    arch = "lumina2"
+    keys_detect = [
+        ("cap_embedder.1.weight", "context_refiner.0.attention.qkv.weight")
+    ]
+class ModelHuMo(ModelTemplate):
+    """Signature for the "humo" architecture; any one of the three signatures is sufficient.
+    `ndims_fix` lets convert_file save tensors above MAX_TENSOR_DIMS to a separate fix file
+    instead of raising.
+    """
+    # NOTE(review): the visible part of lcpp.patch registers no "humo" architecture name;
+    # confirm the patched llama-quantize accepts files written with this arch string.
+    arch = "humo"
+    ndims_fix = True
+    keys_detect = [
+        ("blocks.39.audio_cross_attn_wrapper.norm1_audio.weight",),
+        ("audio_proj.audio_proj_glob_1.layer.weight",),
+        # Contains the first signature's key, so whenever this one matches the first already did.
+        (
+            "blocks.39.audio_cross_attn_wrapper.norm1_audio.weight",
+            "blocks.0.self_attn.norm_q.weight",
+            "text_embedding.2.weight",
+            "head.modulation"
+        ),
+    ]
+    # Substrings matched against tensor names; matching float32/bfloat16 tensors are written as F32.
+    keys_hiprec = ["patch_embedding", "text_embedding", "time_embedding", ".modulation"]
+# The architectures are checked in order and the first successful match terminates the search.
+# Order therefore matters wherever one signature is contained in another:
+#   - ModelHuMo precedes ModelWan (HuMo's last signature contains all of Wan's keys).
+#   - ModelSDXL precedes ModelSD1 (SDXL's first signature contains SD1's first key).
+arch_list = [
+    ModelFlux, ModelSD3, ModelAura, ModelHiDream, ModelCosmosPredict2, ModelQwenImage,
+    ModelLTXV, ModelHyVid, ModelHuMo, ModelWan, ModelSDXL, ModelSD1, ModelLumina2
+]
+def is_model_arch(model, state_dict):
+    # check if model is correct
+    matched = False
+    invalid = False
+    # print(state_dict)
+    for match_list in model.keys_detect:
+        if all(key in state_dict for key in match_list):
+            matched = True
+            invalid = any(key in state_dict for key in model.keys_banned)
+            break
+    assert not invalid, f"Model architecture not allowed for conversion! (i.e. reference VS diffusers format) [arch:{model.arch}]"
+    return matched
+def detect_arch(state_dict):
+    model_arch = None
+    for arch in arch_list:
+        if is_model_arch(arch, state_dict):
+            model_arch = arch()
+            break
+    assert model_arch is not None, "Unknown model architecture!"
+    return model_arch
+def parse_args():
+    parser = argparse.ArgumentParser(description="Generate F16 GGUF files from single UNET")
+    parser.add_argument("--src", required=True, help="Source model ckpt file.")
+    parser.add_argument("--dst", help="Output unet gguf file.")
+    args = parser.parse_args()
+    if not os.path.isfile(args.src):
+        parser.error("No input provided!")
+    return args
+def strip_prefix(state_dict):
+    # prefix for mixed state dict
+    prefix = None
+    for pfx in ["model.diffusion_model.", "model."]:
+        if any([x.startswith(pfx) for x in state_dict.keys()]):
+            prefix = pfx
+            break
+    # prefix for uniform state dict
+    if prefix is None:
+        for pfx in ["net."]:
+            if all([x.startswith(pfx) for x in state_dict.keys()]):
+                prefix = pfx
+                break
+    # strip prefix if found
+    if prefix is not None:
+        logging.info(f"State dict prefix found: '{prefix}'")
+        sd = {}
+        for k, v in state_dict.items():
+            if prefix not in k:
+                continue
+            k = k.replace(prefix, "")
+            sd[k] = v
+    else:
+        logging.debug("State dict has no prefix")
+        sd = state_dict
+    return sd
+def find_main_dtype(state_dict, allow_fp32=False):
+    # detect most common dtype in input
+    dtypes = [x.dtype for x in state_dict.values()]
+    dtypes = {x:dtypes.count(x) for x in set(dtypes)}
+    main_dtype = max(dtypes, key=dtypes.get)
+    if main_dtype == torch.bfloat16:
+        ftype_name = "BF16"
+        ftype_gguf = gguf.LlamaFileType.MOSTLY_BF16
+    elif main_dtype == torch.float32 and allow_fp32:
+        ftype_name = "F32"
+        ftype_gguf = gguf.LlamaFileType.ALL_F32
+    else:
+        ftype_name = "F16"
+        ftype_gguf = gguf.LlamaFileType.MOSTLY_F16
+    return ftype_name, ftype_gguf
+def load_state_dict(path):
+    if any(path.endswith(x) for x in [".ckpt", ".pt", ".bin", ".pth"]):
+        state_dict = torch.load(path, map_location="cpu", weights_only=True)
+        for subkey in ["model", "module"]:
+            if subkey in state_dict:
+                state_dict = state_dict[subkey]
+                break
+        if len(state_dict) < 20:
+            raise RuntimeError(f"pt subkey load failed: {state_dict.keys()}")
+    else:
+        state_dict = load_file(path)
+    return strip_prefix(state_dict)
+def handle_tensors(writer, state_dict, model_arch, allow_fp32=False):
+    name_lengths = tuple(sorted(
+        ((key, len(key)) for key in state_dict.keys()),
+        key=lambda item: item[1],
+        reverse=True,
+    ))
+    if not name_lengths:
+        return
+    max_name_len = name_lengths[0][1]
+    if max_name_len > MAX_TENSOR_NAME_LENGTH:
+        bad_list = ", ".join(f"{key!r} ({namelen})" for key, namelen in name_lengths if namelen > MAX_TENSOR_NAME_LENGTH)
+        raise ValueError(f"Can only handle tensor names up to {MAX_TENSOR_NAME_LENGTH} characters. Tensors exceeding the limit: {bad_list}")
+    invalid_tensors = {}
+    quantized_tensors = {}
+    for key, data in tqdm(state_dict.items()):
+        old_dtype = data.dtype
+        if any(x in key for x in model_arch.keys_ignore):
+            tqdm.write(f"Filtering ignored key: '{key}'")
+            continue
+        if data.dtype == torch.bfloat16:
+            data = data.to(torch.float32).numpy()
+        # this is so we don't break torch 2.0.X
+        elif data.dtype in [getattr(torch, "float8_e4m3fn", "_invalid"), getattr(torch, "float8_e5m2", "_invalid")]:
+            data = data.to(torch.float16).numpy()
+        else:
+            data = data.numpy()
+        n_dims = len(data.shape)
+        data_shape = data.shape
+        if old_dtype == torch.bfloat16:
+            data_qtype = gguf.GGMLQuantizationType.BF16
+        elif old_dtype == torch.float32 and allow_fp32:
+            data_qtype = gguf.GGMLQuantizationType.F32
+        else:
+            data_qtype = gguf.GGMLQuantizationType.F16
+        # The max no. of dimensions that can be handled by the quantization code is 4
+        if len(data.shape) > MAX_TENSOR_DIMS:
+            invalid_tensors[key] = data
+            continue # needs to be added back later
+        # get number of parameters (AKA elements) in this tensor
+        n_params = 1
+        for dim_size in data_shape:
+            n_params *= dim_size
+        if old_dtype in (torch.float32, torch.bfloat16):
+            if n_dims == 1:
+                # one-dimensional tensors should be kept in F32
+                # also speeds up inference due to not dequantizing
+                data_qtype = gguf.GGMLQuantizationType.F32
+            elif n_params <= QUANTIZATION_THRESHOLD:
+                # very small tensors
+                data_qtype = gguf.GGMLQuantizationType.F32
+            elif any(x in key for x in model_arch.keys_hiprec):
+                # tensors that require max precision
+                data_qtype = gguf.GGMLQuantizationType.F32
+        if (model_arch.shape_fix                        # NEVER reshape for models such as flux
+            and n_dims > 1                              # Skip one-dimensional tensors
+            and n_params >= REARRANGE_THRESHOLD         # Only rearrange tensors meeting the size requirement
+            and (n_params / 256).is_integer()           # Rearranging only makes sense if total elements is divisible by 256
+            and not (data.shape[-1] / 256).is_integer() # Only need to rearrange if the last dimension is not divisible by 256
+        ):
+            orig_shape = data.shape
+            data = data.reshape(n_params // 256, 256)
+            writer.add_array(f"comfy.gguf.orig_shape.{key}", tuple(int(dim) for dim in orig_shape))
+        try:
+            data = gguf.quants.quantize(data, data_qtype)
+            quantized_tensors[key] = data_qtype
+        except (AttributeError, gguf.QuantError) as e:
+            tqdm.write(f"falling back to F16: {e}")
+            data_qtype = gguf.GGMLQuantizationType.F16
+            data = gguf.quants.quantize(data, data_qtype)
+            quantized_tensors[key] = data_qtype
+        shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}"
+        tqdm.write(f"{f'%-{max_name_len + 4}s' % f'{key}'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
+        writer.add_tensor(key, data, raw_dtype=data_qtype)
+    return quantized_tensors, invalid_tensors
+def convert_file(path, dst_path=None, interact=True, overwrite=False, allow_fp32=False):
+    # load & run model detection logic
+    state_dict = load_state_dict(path)
+    model_arch = detect_arch(state_dict)
+    logging.info(f"* Architecture detected from input: {model_arch.arch}")
+    ftype_name, ftype_gguf = find_main_dtype(state_dict, allow_fp32=allow_fp32)
+    if dst_path is None:
+        dst_path = f"{os.path.splitext(path)[0]}-{ftype_name}.gguf"
+    elif "{ftype}" in dst_path: # lcpp logic
+        dst_path = dst_path.replace("{ftype}", ftype_name)
+    if os.path.isfile(dst_path) and not overwrite:
+        if interact:
+            input("Output exists enter to continue or ctrl+c to abort!")
+        else:
+            raise OSError("Output exists and overwriting is disabled!")
+    # handle actual file
+    writer = gguf.GGUFWriter(path=None, arch=model_arch.arch)
+    writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
+    if ftype_gguf is not None:
+        writer.add_file_type(ftype_gguf)
+    quantized_tensors, invalid_tensors = handle_tensors(writer, state_dict, model_arch, allow_fp32=allow_fp32)
+    if len(invalid_tensors) > 0:
+        if not model_arch.ndims_fix: # only applies to 5D fix for now, possibly expand to cover more cases?
+            raise ValueError(f"Tensor(s) detected that exceeds dims supported by C++ code! ({invalid_tensors.keys()})")
+        fix_path = os.path.join(
+            os.path.dirname(dst_path),
+            f"fix_5d_tensors_{model_arch.arch}.safetensors"
+        )
+        if os.path.isfile(fix_path):
+            raise RuntimeError(f"Tensor fix file already exists! {path}")
+        invalid_tensors = {k:torch.from_numpy(v.copy()) for k,v in invalid_tensors.items()}
+        save_file(invalid_tensors, fix_path)
+        logging.warning(f"\n### Warning! Fix file found at '{fix_path}'")
+        logging.warning(" you most likely need to run 'fix_5d_tensors.py' after quantization.")
+    else:
+        fix_path = None
+    writer.write_header_to_file(path=dst_path)
+    writer.write_kv_data_to_file()
+    writer.write_tensors_to_file(progress=True)
+    writer.close()
+    return dst_path, model_arch, fix_path
+# Script entry point: convert --src, writing to --dst or the default output name.
+if __name__ == "__main__":
+    args = parse_args()
+    convert_file(args.src, args.dst)

GGUF/fix_5d_tensors.py ADDED Viewed

	@@ -0,0 +1,85 @@

+# (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0)
+import os
+import gguf
+import torch
+import argparse
+from tqdm import tqdm
+from safetensors.torch import load_file
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--src", required=True)
+    parser.add_argument("--dst", required=True)
+    parser.add_argument("--fix", required=False, help="Defaults to ./fix_5d_tensors_[arch].pt")
+    parser.add_argument("--overwrite", action="store_true")
+    args = parser.parse_args()
+    if not os.path.isfile(args.src):
+        parser.error(f"Invalid source file '{args.src}'")
+    if not args.overwrite and os.path.exists(args.dst):
+        parser.error(f"Output exists, use '--overwrite' ({args.dst})")
+    return args
+def get_arch_str(reader):
+    field = reader.get_field("general.architecture")
+    return str(field.parts[field.data[-1]], encoding="utf-8")
+def get_file_type(reader):
+    field = reader.get_field("general.file_type")
+    ft = int(field.parts[field.data[-1]])
+    return gguf.LlamaFileType(ft)
+def apply_5d_fix(src, dst, fix=None, overwrite=False):
+    # read existing
+    reader = gguf.GGUFReader(src)
+    arch = get_arch_str(reader)
+    file_type = get_file_type(reader)
+    print(f"Detected arch: '{arch}' (ftype: {str(file_type)})")
+    # prep fix
+    if fix is None:
+        fix = f"./fix_5d_tensors_{arch}.safetensors"
+    if not os.path.isfile(fix):
+        raise OSError(f"No 5D tensor fix file: {fix}")
+    sd5d = load_file(fix)
+    sd5d = {k:v.numpy() for k,v in sd5d.items()}
+    print("5D tensors:", sd5d.keys())
+    # prep output
+    writer = gguf.GGUFWriter(path=None, arch=arch)
+    writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
+    writer.add_file_type(file_type)
+    global added
+    added = []
+    def add_extra_key(writer, key, data):
+        global added
+        data_qtype = gguf.GGMLQuantizationType.F32
+        data = gguf.quants.quantize(data, data_qtype)
+        tqdm.write(f"Adding key {key} ({data.shape})")
+        writer.add_tensor(key, data, raw_dtype=data_qtype)
+        added.append(key)
+    # main loop to add missing 5D tensor(s)
+    for tensor in tqdm(reader.tensors):
+        writer.add_tensor(tensor.name, tensor.data, raw_dtype=tensor.tensor_type)
+        key5d = tensor.name.replace(".bias", ".weight")
+        if key5d in sd5d.keys():
+            add_extra_key(writer, key5d, sd5d[key5d])
+    # brute force for any missed
+    for key, data in sd5d.items():
+        if key not in added:
+            add_extra_key(writer, key, data)
+    writer.write_header_to_file(path=dst)
+    writer.write_kv_data_to_file()
+    writer.write_tensors_to_file(progress=True)
+    writer.close()
+# Script entry point: apply the fix using the validated command-line arguments.
+if __name__ == "__main__":
+    args = get_args()
+    apply_5d_fix(args.src, args.dst, fix=args.fix, overwrite=args.overwrite)

GGUF/fix_lines_ending.py ADDED Viewed

	@@ -0,0 +1,31 @@

+import os
+# Patch files to normalise; entries missing on disk are reported and skipped by the loop below.
+files = ["lcpp.patch", "lcpp_sd3.patch"]
+def has_unix_line_endings(file_path):
+    try:
+        with open(file_path, 'rb') as file:
+            content = file.read()
+        return b'\r\n' not in content
+    except Exception as e:
+        print(f"Error checking '{file_path}': {e}")
+        return False
+def convert_to_linux_format(file_path):
+    try:
+        with open(file_path, 'rb') as file:
+            content = file.read().replace(b'\r\n', b'\n')
+        with open(file_path, 'wb') as file:
+            file.write(content)
+        print(f"'{file_path}' converted to Linux line endings (LF).")
+    except Exception as e:
+        print(f"Error processing '{file_path}': {e}")
+for file in files:
+    if os.path.exists(file):
+        if has_unix_line_endings(file):
+            print(f"'{file}' already has Unix line endings (LF). No conversion needed.")
+        else:
+            convert_to_linux_format(file)
+    else:
+        print(f"File '{file}' does not exist.")

GGUF/lcpp.patch ADDED Viewed

	@@ -0,0 +1,499 @@

+diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
+index de3c706f..0267c1fa 100644
+--- a/ggml/include/ggml.h
++++ b/ggml/include/ggml.h
+@@ -223,7 +223,7 @@
+ #define GGML_MAX_OP_PARAMS      64
+ #ifndef GGML_MAX_NAME
+-#   define GGML_MAX_NAME        64
++#   define GGML_MAX_NAME        128
+ #endif
+ #define GGML_DEFAULT_N_THREADS  4
+@@ -2449,6 +2449,7 @@ extern "C" {
+     // manage tensor info
+     GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
++    GGML_API void gguf_set_tensor_ndim(struct gguf_context * ctx, const char * name, int n_dim);
+     GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
+     GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
+diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
+index b16c462f..6d1568f1 100644
+--- a/ggml/src/ggml.c
++++ b/ggml/src/ggml.c
+@@ -22960,6 +22960,14 @@ void gguf_add_tensor(
+     ctx->header.n_tensors++;
+ }
++void gguf_set_tensor_ndim(struct gguf_context * ctx, const char * name, const int n_dim) {
++    const int idx = gguf_find_tensor(ctx, name);
++    if (idx < 0) {
++        GGML_ABORT("tensor not found");
++    }
++    ctx->infos[idx].n_dims = n_dim;
++}
++
+ void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
+     const int idx = gguf_find_tensor(ctx, name);
+     if (idx < 0) {
+diff --git a/src/llama.cpp b/src/llama.cpp
+index 24e1f1f0..8a1e9ef8 100644
+--- a/src/llama.cpp
++++ b/src/llama.cpp
+@@ -205,6 +205,18 @@ enum llm_arch {
+     LLM_ARCH_GRANITE,
+     LLM_ARCH_GRANITE_MOE,
+     LLM_ARCH_CHAMELEON,
++    LLM_ARCH_FLUX,
++    LLM_ARCH_SD1,
++    LLM_ARCH_SDXL,
++    LLM_ARCH_SD3,
++    LLM_ARCH_AURA,
++    LLM_ARCH_LTXV,
++    LLM_ARCH_HYVID,
++    LLM_ARCH_WAN,
++    LLM_ARCH_HIDREAM,
++    LLM_ARCH_COSMOS,
++    LLM_ARCH_LUMINA2,
++    LLM_ARCH_QWEN_IMAGE,
+     LLM_ARCH_UNKNOWN,
+ };
+@@ -258,6 +270,18 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+     { LLM_ARCH_GRANITE,         "granite"      },
+     { LLM_ARCH_GRANITE_MOE,     "granitemoe"   },
+     { LLM_ARCH_CHAMELEON,       "chameleon"    },
++    { LLM_ARCH_FLUX,            "flux"         },
++    { LLM_ARCH_SD1,             "sd1"          },
++    { LLM_ARCH_SDXL,            "sdxl"         },
++    { LLM_ARCH_SD3,             "sd3"          },
++    { LLM_ARCH_AURA,            "aura"         },
++    { LLM_ARCH_LTXV,            "ltxv"         },
++    { LLM_ARCH_HYVID,           "hyvid"        },
++    { LLM_ARCH_WAN,             "wan"          },
++    { LLM_ARCH_HIDREAM,         "hidream"      },
++    { LLM_ARCH_COSMOS,          "cosmos"       },
++    { LLM_ARCH_LUMINA2,         "lumina2"      },
++    { LLM_ARCH_QWEN_IMAGE,      "qwen_image"   },
+     { LLM_ARCH_UNKNOWN,         "(unknown)"    },
+ };
+@@ -1531,6 +1555,18 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
+             { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
+         },
+     },
++    { LLM_ARCH_FLUX,       {}},
++    { LLM_ARCH_SD1,        {}},
++    { LLM_ARCH_SDXL,       {}},
++    { LLM_ARCH_SD3,        {}},
++    { LLM_ARCH_AURA,       {}},
++    { LLM_ARCH_LTXV,       {}},
++    { LLM_ARCH_HYVID,      {}},
++    { LLM_ARCH_WAN,        {}},
++    { LLM_ARCH_HIDREAM,    {}},
++    { LLM_ARCH_COSMOS,     {}},
++    { LLM_ARCH_LUMINA2,    {}},
++    { LLM_ARCH_QWEN_IMAGE, {}},
+     {
+         LLM_ARCH_UNKNOWN,
+         {
+@@ -5403,6 +5439,26 @@ static void llm_load_hparams(
+     // get general kv
+     ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
++    // Disable LLM metadata for image models
++    switch (model.arch) {
++        case LLM_ARCH_FLUX:
++        case LLM_ARCH_SD1:
++        case LLM_ARCH_SDXL:
++        case LLM_ARCH_SD3:
++        case LLM_ARCH_AURA:
++        case LLM_ARCH_LTXV:
++        case LLM_ARCH_HYVID:
++        case LLM_ARCH_WAN:
++        case LLM_ARCH_HIDREAM:
++        case LLM_ARCH_COSMOS:
++        case LLM_ARCH_LUMINA2:
++        case LLM_ARCH_QWEN_IMAGE:
++            model.ftype = ml.ftype;
++            return;
++        default:
++            break;
++    }
++
+     // get hparams kv
+     ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
+@@ -18016,6 +18072,158 @@ static void llama_tensor_dequantize_internal(
+     workers.clear();
+ }
++static ggml_type img_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
++    // Special function for quantizing image model tensors
++    const std::string name = ggml_get_name(tensor);
++    const llm_arch arch = qs.model.arch;
++
++    // Sanity check
++    if (
++            (name.find("model.diffusion_model.") != std::string::npos) ||
++            (name.find("first_stage_model.") != std::string::npos) ||
++            (name.find("single_transformer_blocks.") != std::string::npos) ||
++            (name.find("joint_transformer_blocks.") != std::string::npos)
++        ) {
++            throw std::runtime_error("Invalid input GGUF file. This is not a supported UNET model");
++    }
++
++    // Unsupported quant types - exclude all IQ quants for now
++    if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS  ||
++        ftype == LLAMA_FTYPE_MOSTLY_IQ2_S   || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M  ||
++        ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S  ||
++        ftype == LLAMA_FTYPE_MOSTLY_IQ1_M   || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
++        ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS  || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S  ||
++        ftype == LLAMA_FTYPE_MOSTLY_IQ3_M   || ftype == LLAMA_FTYPE_MOSTLY_Q4_0_4_4 ||
++        ftype == LLAMA_FTYPE_MOSTLY_Q4_0_4_8 || ftype == LLAMA_FTYPE_MOSTLY_Q4_0_8_8) {
++        throw std::runtime_error("Invalid quantization type for image model (Not supported)");
++    }
++
++    if ( // Rules for to_v attention
++            (name.find("attn_v.weight") != std::string::npos) ||
++            (name.find(".to_v.weight") != std::string::npos) ||
++            (name.find(".v.weight") != std::string::npos) ||
++            (name.find(".attn.w1v.weight") != std::string::npos) ||
++            (name.find(".attn.w2v.weight") != std::string::npos) ||
++            (name.find(".add_v_proj.weight") != std::string::npos) ||
++            (name.find("_attn.v_proj.weight") != std::string::npos)
++        ){
++            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
++                new_type = GGML_TYPE_Q3_K;
++            }
++            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
++                new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
++            }
++            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
++                new_type = GGML_TYPE_Q5_K;
++            }
++            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
++                new_type = GGML_TYPE_Q6_K;
++            }
++            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) {
++                new_type = GGML_TYPE_Q5_K;
++            }
++            ++qs.i_attention_wv;
++    } else if ( // Rules for fused qkv attention
++            (name.find("attn_qkv.weight") != std::string::npos) ||
++            (name.find("attn.qkv.weight") != std::string::npos) ||
++            (name.find("attention.qkv.weight") != std::string::npos)
++        ) {
++            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
++                new_type = GGML_TYPE_Q4_K;
++            }
++            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
++                new_type = GGML_TYPE_Q5_K;
++            }
++            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
++                new_type = GGML_TYPE_Q6_K;
++            }
++    } else if ( // Rules for ffn
++            (name.find("ffn_down") != std::string::npos) ||
++            ((name.find("experts.") != std::string::npos) && (name.find(".w2.weight") != std::string::npos)) ||
++            (name.find(".ffn.2.weight") != std::string::npos) || // is this even the right way around?
++            (name.find(".ff.net.2.weight") != std::string::npos) ||
++            (name.find(".mlp.layer2.weight") != std::string::npos) ||
++            (name.find(".adaln_modulation_mlp.2.weight") != std::string::npos) ||
++            (name.find(".feed_forward.w2.weight") != std::string::npos) ||
++            (name.find(".img_mlp.net.2.weight") != std::string::npos) ||
++            (name.find(".txt_mlp.net.2.weight") != std::string::npos)
++        ) {
++            // TODO: add back `layer_info` with some model specific logic + logic further down
++            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
++                new_type = GGML_TYPE_Q4_K;
++            }
++            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
++                new_type = GGML_TYPE_Q5_K;
++            }
++            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) {
++                new_type = GGML_TYPE_Q5_K;
++            }
++            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
++                new_type = GGML_TYPE_Q6_K;
++            }
++            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
++                new_type = GGML_TYPE_Q6_K;
++            }
++            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_0) {
++                new_type = GGML_TYPE_Q4_1;
++            }
++            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_0) {
++                new_type = GGML_TYPE_Q5_1;
++            }
++            ++qs.i_ffn_down;
++    }
++
++    // first/last block high precision test
++    if (arch == LLM_ARCH_QWEN_IMAGE){
++        if (
++            (name.find("transformer_blocks.0.") != std::string::npos) ||
++            (name.find("transformer_blocks.59.") != std::string::npos) // this should be dynamic
++        ) {
++            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
++                new_type = GGML_TYPE_Q4_K;
++            }
++            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
++                new_type = GGML_TYPE_Q4_K;
++            }
++            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
++                new_type = GGML_TYPE_Q5_K;
++            }
++            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
++                new_type = GGML_TYPE_Q6_K;
++            }
++        }
++    }
++
++    // Sanity check for row shape
++    bool convert_incompatible_tensor = false;
++    if (new_type == GGML_TYPE_Q2_K    || new_type == GGML_TYPE_Q3_K    || new_type == GGML_TYPE_Q4_K   ||
++        new_type == GGML_TYPE_Q5_K    || new_type == GGML_TYPE_Q6_K) {
++        int nx = tensor->ne[0];
++        int ny = tensor->ne[1];
++        if (nx % QK_K != 0) {
++            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
++            convert_incompatible_tensor = true;
++        } else {
++            ++qs.n_k_quantized;
++        }
++    }
++    if (convert_incompatible_tensor) {
++        // TODO: Possibly reenable this in the future
++        // switch (new_type) {
++        //     case GGML_TYPE_Q2_K:
++        //     case GGML_TYPE_Q3_K:
++        //     case GGML_TYPE_Q4_K:   new_type = GGML_TYPE_Q5_0;   break;
++        //     case GGML_TYPE_Q5_K:   new_type = GGML_TYPE_Q5_1;   break;
++        //     case GGML_TYPE_Q6_K:   new_type = GGML_TYPE_Q8_0;   break;
++        //     default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
++        // }
++        new_type = GGML_TYPE_F16;
++        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
++        ++qs.n_fallback;
++    }
++    return new_type;
++}
++
+ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
+     const std::string name = ggml_get_name(tensor);
+@@ -18513,7 +18721,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
+         if (llama_model_has_encoder(&model)) {
+             n_attn_layer *= 3;
+         }
+-        GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
++        if (model.arch != LLM_ARCH_HYVID) { // TODO: Check why this fails
++            GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
++        }
+     }
+     size_t total_size_org = 0;
+@@ -18547,6 +18757,57 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
+             ctx_outs[i_split] = gguf_init_empty();
+         }
+         gguf_add_tensor(ctx_outs[i_split], tensor);
++        // SD3 pos_embed needs special fix as first dim is 1, which gets truncated here
++        if (model.arch == LLM_ARCH_SD3) {
++            const std::string name = ggml_get_name(tensor);
++            if (name == "pos_embed" && tensor->ne[2] == 1) {
++                const int n_dim = 3;
++                gguf_set_tensor_ndim(ctx_outs[i_split], "pos_embed", n_dim);
++                LLAMA_LOG_INFO("\n%s: Correcting pos_embed shape for SD3: [key:%s]\n", __func__, tensor->name);
++            }
++        }
++        // same goes for auraflow
++        if (model.arch == LLM_ARCH_AURA) {
++            const std::string name = ggml_get_name(tensor);
++            if (name == "positional_encoding" && tensor->ne[2] == 1) {
++                const int n_dim = 3;
++                gguf_set_tensor_ndim(ctx_outs[i_split], "positional_encoding", n_dim);
++                LLAMA_LOG_INFO("\n%s: Correcting positional_encoding shape for AuraFlow: [key:%s]\n", __func__, tensor->name);
++            }
++            if (name == "register_tokens" && tensor->ne[2] == 1) {
++                const int n_dim = 3;
++                gguf_set_tensor_ndim(ctx_outs[i_split], "register_tokens", n_dim);
++                LLAMA_LOG_INFO("\n%s: Correcting register_tokens shape for AuraFlow: [key:%s]\n", __func__, tensor->name);
++            }
++        }
++        // conv3d fails due to max dims - unsure what to do here as we never even reach this check
++        if (model.arch == LLM_ARCH_HYVID) {
++            const std::string name = ggml_get_name(tensor);
++            if (name == "img_in.proj.weight" && tensor->ne[5] != 1 ) {
++                throw std::runtime_error("img_in.proj.weight size failed for HyVid");
++            }
++        }
++        // All the modulation layers also have dim1, and I think conv3d fails here too but we segfault way before that...
++        if (model.arch == LLM_ARCH_WAN) {
++            const std::string name = ggml_get_name(tensor);
++            if (name.find(".modulation") != std::string::npos && tensor->ne[2] == 1) {
++                const int n_dim = 3;
++                gguf_set_tensor_ndim(ctx_outs[i_split], tensor->name, n_dim);
++                LLAMA_LOG_INFO("\n%s: Correcting shape for Wan: [key:%s]\n", __func__, tensor->name);
++            }
++            // FLF2V model only
++            if (name == "img_emb.emb_pos") {
++                const int n_dim = 3;
++                gguf_set_tensor_ndim(ctx_outs[i_split], tensor->name, n_dim);
++                LLAMA_LOG_INFO("\n%s: Correcting shape for Wan FLF2V: [key:%s]\n", __func__, tensor->name);
++            }
++            // S2V model only
++            if (name == "casual_audio_encoder.weights" || name == "casual_audio_encoder.encoder.padding_tokens") {
++                const int n_dim = 4;
++                gguf_set_tensor_ndim(ctx_outs[i_split], tensor->name, n_dim);
++                LLAMA_LOG_INFO("\n%s: Correcting shape for Wan S2V: [key:%s]\n", __func__, tensor->name);
++            }
++        }
+     }
+     // Set split info if needed
+@@ -18647,6 +18908,124 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
+         // do not quantize relative position bias (T5)
+         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
++        // rules for image models
++        bool image_model = false;
++        if (model.arch == LLM_ARCH_FLUX) {
++            image_model = true;
++            quantize &= name.find("txt_in.") == std::string::npos;
++            quantize &= name.find("img_in.") == std::string::npos;
++            quantize &= name.find("time_in.") == std::string::npos;
++            quantize &= name.find("vector_in.") == std::string::npos;
++            quantize &= name.find("guidance_in.") == std::string::npos;
++            quantize &= name.find("final_layer.") == std::string::npos;
++        }
++        if (model.arch == LLM_ARCH_SD1 || model.arch == LLM_ARCH_SDXL) {
++            image_model = true;
++            quantize &= name.find("class_embedding.") == std::string::npos;
++            quantize &= name.find("time_embedding.") == std::string::npos;
++            quantize &= name.find("add_embedding.") == std::string::npos;
++            quantize &= name.find("time_embed.") == std::string::npos;
++            quantize &= name.find("label_emb.") == std::string::npos;
++            quantize &= name.find("conv_in.") == std::string::npos;
++            quantize &= name.find("conv_out.") == std::string::npos;
++            quantize &= name != "input_blocks.0.0.weight";
++            quantize &= name != "out.2.weight";
++        }
++        if (model.arch == LLM_ARCH_SD3) {
++            image_model = true;
++            quantize &= name.find("final_layer.") == std::string::npos;
++            quantize &= name.find("time_text_embed.") == std::string::npos;
++            quantize &= name.find("context_embedder.") == std::string::npos;
++            quantize &= name.find("t_embedder.") == std::string::npos;
++            quantize &= name.find("y_embedder.") == std::string::npos;
++            quantize &= name.find("x_embedder.") == std::string::npos;
++            quantize &= name != "proj_out.weight";
++            quantize &= name != "pos_embed";
++        }
++        if (model.arch == LLM_ARCH_AURA) {
++            image_model = true;
++            quantize &= name.find("t_embedder.") == std::string::npos;
++            quantize &= name.find("init_x_linear.") == std::string::npos;
++            quantize &= name != "modF.1.weight";
++            quantize &= name != "cond_seq_linear.weight";
++            quantize &= name != "final_linear.weight";
++            quantize &= name != "final_linear.weight";
++            quantize &= name != "positional_encoding";
++            quantize &= name != "register_tokens";
++        }
++        if (model.arch == LLM_ARCH_LTXV) {
++            image_model = true;
++            quantize &= name.find("adaln_single.") == std::string::npos;
++            quantize &= name.find("caption_projection.") == std::string::npos;
++            quantize &= name.find("patchify_proj.") == std::string::npos;
++            quantize &= name.find("proj_out.") == std::string::npos;
++            quantize &= name.find("scale_shift_table") == std::string::npos; // last block too
++        }
++        if (model.arch == LLM_ARCH_HYVID) {
++            image_model = true;
++            quantize &= name.find("txt_in.") == std::string::npos;
++            quantize &= name.find("img_in.") == std::string::npos;
++            quantize &= name.find("time_in.") == std::string::npos;
++            quantize &= name.find("vector_in.") == std::string::npos;
++            quantize &= name.find("guidance_in.") == std::string::npos;
++            quantize &= name.find("final_layer.") == std::string::npos;
++        }
++        if (model.arch == LLM_ARCH_WAN) {
++            image_model = true;
++            quantize &= name.find("modulation.") == std::string::npos;
++            quantize &= name.find("patch_embedding.") == std::string::npos;
++            quantize &= name.find("text_embedding.") == std::string::npos;
++            quantize &= name.find("time_projection.") == std::string::npos;
++            quantize &= name.find("time_embedding.") == std::string::npos;
++            quantize &= name.find("img_emb.") == std::string::npos;
++            quantize &= name.find("head.") == std::string::npos;
++            // S2V
++            quantize &= name.find("cond_encoder.") == std::string::npos;
++            quantize &= name.find("frame_packer.") == std::string::npos;
++            quantize &= name.find("audio_injector.") == std::string::npos;
++            quantize &= name.find("casual_audio_encoder.") == std::string::npos;
++            quantize &= name.find("trainable_cond_mask.") == std::string::npos;
++        }
++        if (model.arch == LLM_ARCH_HIDREAM) {
++            image_model = true;
++            quantize &= name.find("p_embedder.") == std::string::npos;
++            quantize &= name.find("t_embedder.") == std::string::npos;
++            quantize &= name.find("x_embedder.") == std::string::npos;
++            quantize &= name.find("final_layer.") == std::string::npos;
++            quantize &= name.find(".ff_i.gate.weight") == std::string::npos;
++            quantize &= name.find("caption_projection.") == std::string::npos;
++        }
++        if (model.arch == LLM_ARCH_COSMOS) {
++            image_model = true;
++            quantize &= name.find("p_embedder.") == std::string::npos;
++            quantize &= name.find("t_embedder.") == std::string::npos;
++            quantize &= name.find("t_embedding_norm.") == std::string::npos;
++            quantize &= name.find("x_embedder.") == std::string::npos;
++            quantize &= name.find("pos_embedder.") == std::string::npos;
++            quantize &= name.find("final_layer.") == std::string::npos;
++        }
++        if (model.arch == LLM_ARCH_LUMINA2) {
++            image_model = true;
++            quantize &= name.find("t_embedder.") == std::string::npos;
++            quantize &= name.find("x_embedder.") == std::string::npos;
++            quantize &= name.find("final_layer.") == std::string::npos;
++            quantize &= name.find("cap_embedder.") == std::string::npos;
++            quantize &= name.find("context_refiner.") == std::string::npos;
++            quantize &= name.find("noise_refiner.") == std::string::npos;
++        }
++        if (model.arch == LLM_ARCH_QWEN_IMAGE) {
++            image_model = true;
++            quantize &= name.find("img_in.") == std::string::npos;
++            quantize &= name.find("txt_in.") == std::string::npos;
++            quantize &= name.find("time_text_embed.") == std::string::npos;
++            quantize &= name.find("proj_out.") == std::string::npos;
++            quantize &= name.find("norm_out.") == std::string::npos;
++        }
++        // ignore 3D/4D tensors for image models as the code was never meant to handle these
++        if (image_model) {
++            quantize &= ggml_n_dims(tensor) == 2;
++        }
++
+         enum ggml_type new_type;
+         void * new_data;
+         size_t new_size;
+@@ -18655,6 +19034,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
+             new_type = default_type;
+             // get more optimal quantization type based on the tensor shape, layer, etc.
++            if (image_model) {
++                new_type = img_tensor_get_type(qs, new_type, tensor, ftype);
++            } else {
+             if (!params->pure && ggml_is_quantized(default_type)) {
+                 new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
+             }
+@@ -18664,6 +19046,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
+             if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
+                 new_type = params->output_tensor_type;
+             }
++            }
+             // If we've decided to quantize to the same type the tensor is already
+             // in then there's nothing to do.

GGUF/llama.cpp ADDED Viewed

The diff for this file is too large to render. See raw diff

GGUF/read_tensors.py ADDED Viewed

	@@ -0,0 +1,21 @@

+#!/usr/bin/python3
+import os
+import sys
+import gguf
+def read_tensors(path):
+    reader = gguf.GGUFReader(path)
+    for tensor in reader.tensors:
+        if tensor.tensor_type == gguf.GGMLQuantizationType.F32:
+            continue
+        print(f"{str(tensor.tensor_type):32}: {tensor.name}")
+try:  # validate the command-line argument before reading anything
+    path = sys.argv[1]  # a missing argument raises IndexError, reported by the handler below
+    assert os.path.isfile(path), "Invalid path"
+    print(f"input: {path}")
+except Exception as e:
+    input(f"failed: {e}")  # input() shows the failure and blocks until Enter is pressed
+else:
+    read_tensors(path)
+    input()  # block until Enter is pressed before the script exits

GGUF/tool_auto.py ADDED Viewed

	@@ -0,0 +1,374 @@

+# (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0)
+import os
+import re
+import sys
+import time
+import torch
+import logging
+import argparse
+import subprocess
+import huggingface_hub as hf
+logging.getLogger().setLevel(logging.DEBUG)  # root logger passes everything, including run_cmd's debug output
+qtypes =[  # quantization types accepted by --quants; "all" expands to "base" plus this list
+    # "F16", "BF16",
+    "Q8_0", "Q6_K",
+    "Q5_K_M", "Q5_K_S", "Q5_1", "Q5_0",
+    "Q4_K_M", "Q4_K_S", "Q4_1", "Q4_0",
+    "Q3_K_M", "Q3_K_S", "Q2_K"
+]
+dtype_dict = {  # looked up by get_hf_fake_sd with the dtype string from safetensors metadata
+    "F32": torch.float32,
+    "F16": torch.float16,
+    "BF16": torch.bfloat16,
+    "F8_E4M3": getattr(torch, "float8_e4m3fn", "_invalid"),  # "_invalid" placeholder when torch lacks this dtype
+    "F8_E5M2": getattr(torch, "float8_e5m2", "_invalid"),  # same fallback as above
+}
+# this is pretty jank but I want to be able to run it on a blank instance w/o setup
+terraform_dict = {  # repositories and revisions that setup_utils clones and checks out
+    "repo": "city96/ComfyUI-GGUF",  # GitHub "owner/name" cloned when the script runs outside a tools folder
+    "target": "auto_convert",  # ComfyUI-GGUF branch checked out after cloning or on forced update
+    "lcpp_repo": "ggerganov/llama.cpp",  # GitHub "owner/name" of the llama.cpp source
+    "lcpp_target": "tags/b3962",  # llama.cpp revision checked out before lcpp.patch is applied
+}
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--src", required=True, help="Source model file or huggingface repo name")
+    parser.add_argument("--quants", nargs="+", choices=["all", "base", *qtypes], default=["Q8_0"])
+    parser.add_argument("--output-dir", default=None, help="Location for output files, defaults to current dir or ComfyUI model dir.")
+    parser.add_argument("--temp-dir", default=None, help="Location for temp files, defaults to [output_dir]/tmp")
+    parser.add_argument("--force-update", action="store_true", help="Force update & rebuild entire quantization stack.")
+    parser.add_argument("--resume", action="store_true", help="Skip over existing files. Will NOT check for broken/interrupted files.")
+    args = parser.parse_args()
+    if args.output_dir is None:
+        args.output_dir = get_output_dir()
+    if args.temp_dir is None:
+        args.temp_dir = os.path.join(args.output_dir, "tmp")
+    if os.path.isdir(args.temp_dir) and len(os.listdir(args.temp_dir)) > 0:
+        raise OSError("Output temp folder not empty!")
+    if "all" in args.quants:
+        args.quants = ["base", *qtypes]
+    return args
+def run_cmd(*args, log_error=False):
+    logging.debug(f"cmd: {args}")
+    try:
+        log = subprocess.run(args, capture_output=True, text=True)
+    except Exception as e:
+        logging.warning(f"{args[0]}, {e}")
+        return -1
+    if log.returncode != 0 and log_error:
+        logging.warning(f"{args[0]}: {log.stdout} {log.stderr}")
+    else:
+        logging.debug(f"{args[0]}: {repr(log.stdout)} {repr(log.stderr.strip())} RET:{log.returncode}")
+    return log.returncode
+def setup_utils(force_update=False):
+    # get ComfyUI-GGUF if missing, then compile patched llama.cpp if required
+    root = os.path.dirname(os.path.abspath(__file__))
+    root = os.path.normpath(root)
+    if os.path.split(root)[1] != "tools":
+        cg_dir = os.path.join(root, "ComfyUI-GGUF")
+        if not os.path.isdir(cg_dir):
+            logging.warning(f"Running outside tools folder! Cloning to {cg_dir}")
+            run_cmd("git", "clone", f"https://github.com/{terraform_dict['repo']}", cg_dir)
+            need_update = True
+        else:
+            need_update = False
+        if force_update or need_update:
+            if terraform_dict['target']:
+                logging.info(f"Attemtping to check out ComfyUI-GGUF branch {terraform_dict['target']}")
+                run_cmd("git", "-C", cg_dir, "checkout", terraform_dict['target'])
+            logging.info("Attemtping to git pull ComfyUI-GGUF to latest")
+            run_cmd("git", "-C", cg_dir, "pull")
+        tools_dir = os.path.join(root, "ComfyUI-GGUF", "tools")
+        sys.path.append(tools_dir) # to make import(s) work
+    else:
+        # TODO: Git pull here too?
+        logging.warning(f"Assuming latest ComfyUI-GGUF. Please git pull & check out branch {terraform_dict['target']} manually!")
+        tools_dir = root
+    if not os.path.isdir(tools_dir):
+        raise OSError(f"Can't find tools subfoder in ComfyUI-GGUF at {tools_dir}")
+    convert_path = os.path.join(tools_dir, "convert.py")
+    if not os.path.isfile(convert_path):
+        raise OSError(f"Cannot find convert.py at location: {convert_path}")
+    lcpp_path = os.path.join(root, "llama.cpp.auto") # avoid messing with regular dir
+    if not os.path.isdir(lcpp_path):
+        logging.info(f"Attemtping to clone llama.cpp repo to {lcpp_path}")
+        run_cmd("git", "clone", f"https://github.com/{terraform_dict['lcpp_repo']}", lcpp_path)
+        need_update = True
+    else:
+        need_update = False
+    if force_update or need_update:
+        # TODO: check reflog and/or git reset before checkout?
+        logging.info(f"Attemtping to check out llama.cpp target {terraform_dict['lcpp_target']}")
+        run_cmd("git", "-C", lcpp_path, "checkout", terraform_dict['lcpp_target'])
+        # TODO: git reset before patch?
+        patch_path = os.path.join(tools_dir, "lcpp.patch")
+        # patch (probably) has wrong file endings:
+        logging.info("Converting patch file endings")
+        with open(patch_path, "rb") as file:
+            content = file.read().replace(b"\r\n", b"\n")
+        with open(patch_path, "wb") as file:
+            file.write(content)
+        if run_cmd("git", "-C", lcpp_path, "apply", "--check", "-R", patch_path) != 0:
+            logging.info("Attemtping to apply patch to llama.cpp repo")
+            run_cmd("git", "-C", lcpp_path, "apply", patch_path)
+        else:
+            logging.info("Patch already applied")
+    # using cmake here as llama.cpp switched to it completely for new versions
+    if os.name == "nt":
+        bin_path = os.path.join(lcpp_path, "build", "bin", "debug", "llama-quantize.exe")
+    else:
+        bin_path = os.path.join(lcpp_path, "build", "bin", "llama-quantize")
+    if not os.path.isfile(bin_path) or force_update or need_update:
+        if run_cmd("cmake", "--version") != 0:
+            raise RuntimeError("Can't find cmake! Make sure you have a working build environment set up")
+        build_path = os.path.join(lcpp_path, "build")
+        os.makedirs(build_path, exist_ok=True)
+        logging.info("Attempting to build llama.cpp binary from source")
+        run_cmd("cmake", "-B", build_path, lcpp_path)
+        run_cmd("cmake", "--build", build_path, "--config", "Debug", "-j4", "--target", "llama-quantize")
+        if not os.path.isfile(bin_path):
+            raise RuntimeError("Build failed! Rerun with --debug to see error log.")
+    else:
+        logging.info("Binary already present")
+    return bin_path
+def get_output_dir():
+    root = os.path.dirname(os.path.abspath(__file__))
+    root = os.path.normpath(root)
+    split = os.path.split(root)
+    while split[1]:
+        if split[1] == "ComfyUI":
+            if os.path.isdir(os.path.join(*split, "models", "unet")): # new
+                root = os.path.join(*split, "models", "unet", "gguf")
+                logging.info(f"Found ComfyUI, using model folder: {root}")
+                return root
+            if os.path.isdir(os.path.join(*split, "models", "diffusion_models")): # old
+                root = os.path.join(*split, "models", "diffusion_models", "gguf")
+                logging.info(f"Found ComfyUI, using model folder: {root}")
+                return root
+            logging.info("Found ComfyUI, but can't find model folder")
+            break
+        split = os.path.split(split[0])
+    root = os.path.join(root, "models")
+    logging.info(f"Defaulting to [script dir]/models: {root}")
+    return root
+def get_hf_fake_sd(repo, path, device=torch.device("meta")):
+    sd = {}
+    meta = hf.parse_safetensors_file_metadata(repo, path)
+    for key, raw in meta.tensors.items():
+        shape = tuple(raw.shape)
+        dtype = dtype_dict.get(raw.dtype, torch.float32)
+        sd[key] = torch.zeros(shape, dtype=dtype, device=device)
+    return sd
+def get_hf_file_arch(repo, path):
+    pattern = r'(\d+)-of-(\d+)'
+    match = re.search(pattern, path)
+    if match:
+        # we need to load it as multipart
+        if int(match.group(1)) != 1:
+            return None
+        sd = {}
+        for k in range(int(match.group(2))):
+            shard_path = path.replace(match.group(1), f"{k+1:0{len(match.group(1))}}")
+            sd.update(get_hf_fake_sd(repo, shard_path))
+    else:
+        sd = get_hf_fake_sd(repo, path)
+    # this should raise an error on failure
+    sd = strip_prefix(sd)
+    model_arch = detect_arch(sd)
+    # this is for SDXL and SD1.5, I want to overhaul this logic to match sd.cpp eventually
+    assert not model_arch.shape_fix, "Model uses shape fix (SDXL/SD1) - unsupported for now."
+    return model_arch.arch
+def get_hf_valid_files(repo):
+    # TODO: probably tweak this?
+    MIN_SIZE_GB = 1
+    VALID_SRC_EXTS = [".safetensors", ] # ".pt", ".ckpt", ]
+    meta = hf.model_info(repo, files_metadata=True)
+    valid = {}
+    for file in meta.siblings:
+        path = file.rfilename
+        fname = os.path.basename(path)
+        name, ext = os.path.splitext(fname)
+        if ext.lower() not in VALID_SRC_EXTS:
+            logging.debug(f"Invalid ext: {path} {ext}")
+            continue
+        if file.size / (1024 ** 3) < MIN_SIZE_GB:
+            logging.debug(f"File too small: {path} {file.size}")
+            continue
+        try:
+            arch = get_hf_file_arch(repo, path)
+        except Exception as e:
+            logging.warning(f"Arch detect fail: {e} ({path})")
+        else:
+            if arch is not None:
+                valid[path] = arch
+                logging.info(f"Found '{arch}' model at path {path}")
+    return valid
+def make_base_quant(src, output_dir, temp_dir, final=True, resume=True):
+    name, ext = os.path.splitext(os.path.basename(src))
+    if ext == ".gguf":
+        logging.info("Input file already in gguf, assuming base quant")
+        return None, src, None
+    name = name.lower() # uncomment to preserve case in all quants
+    dst_tmp = os.path.join(temp_dir, f"{name}-{{ftype}}.gguf") # ftype is filled in by convert.py
+    tmp_path, model_arch, fix_path = convert_file(src, dst_tmp, interact=False, overwrite=False)
+    dst_path = os.path.join(output_dir, os.path.basename(tmp_path))
+    if os.path.isfile(dst_path):
+        if resume:
+            logging.warning("Resuming with interrupted base quant, may be incorrect!")
+            return dst_path, tmp_path, fix_path
+        raise OSError(f"Output already exists! Clear folder? {dst_path}")
+    if fix_path is not None and os.path.isfile(fix_path):
+        quant_source = tmp_path
+        if final:
+            apply_5d_fix(tmp_path, dst_path, fix=fix_path, overwrite=False)
+        else:
+            dst_path = None
+    else:
+        fix_path = None
+        if final:
+            os.rename(tmp_path, dst_path)
+            quant_source = dst_path
+        else:
+            dst_path = None
+            quant_source = tmp_path
+    return dst_path, quant_source, fix_path
+def make_quant(src, output_dir, temp_dir, qtype, quantize_binary, fix_path=None, resume=True):
+    name, ext = os.path.splitext(os.path.basename(src))
+    assert ext.lower() == ".gguf", "Invalid input file"
+    src_qtext = [x for x in ["-F32.gguf", "-F16.gguf", "-BF16.gguf"] if x in src]
+    if len(src_qtext) == 1:
+        tmp_path = os.path.join(
+            temp_dir,
+            os.path.basename(src).replace(src_qtext[0], f"-{qtype.upper()}.gguf")
+        )
+    else:
+        tmp_path = os.path.join(
+            temp_dir,
+            f"{name}-{qtype.upper()}.gguf"
+        )
+    tmp_path = os.path.abspath(tmp_path)
+    dst_path = os.path.join(output_dir, os.path.basename(tmp_path))
+    if os.path.isfile(dst_path):
+        if resume:
+            return dst_path
+        raise OSError("Output already exists! Clear folder?")
+    r = run_cmd(quantize_binary, src, tmp_path, qtype, log_error=True)
+    time.sleep(2) # leave time for file sync?
+    if r != 0:
+        raise RuntimeError(f"Quantization failed with error code {r}")
+    if fix_path is not None:
+        apply_5d_fix(tmp_path, dst_path, fix=fix_path, overwrite=False)
+        if os.path.isfile(dst_path) and os.path.isfile(tmp_path):
+            os.remove(tmp_path)
+    else:
+        os.rename(tmp_path, dst_path)
+    return dst_path
+if __name__ == "__main__":
+    args = get_args()
+    os.makedirs(args.output_dir, exist_ok=True)
+    os.makedirs(args.temp_dir, exist_ok=True)
+    quantize_binary = setup_utils(args.force_update)
+    try:
+        from convert import detect_arch, strip_prefix, convert_file
+        from fix_5d_tensors import apply_5d_fix
+    except [ImportError, ModuleNotFoundError] as e:
+        raise ImportError(f"Can't import required utils: {e}")
+    if not os.path.isfile(args.src):
+        # huggingface repo. TODO: file choice
+        if len(args.src.split("/")) != "1":
+            raise OSError(f"Invalid huggingface repo or model path {args.src}")
+        raise NotImplementedError("HF not yet supported")
+        # download then set to temp file
+        # hf_repo = "Lightricks/LTX-Video" # "fal/AuraFlow-v0.3"
+        # get_hf_valid_files(hf_repo)
+        # args.src = ...
+    out_files = []
+    base_quant, quant_source, fix_path = make_base_quant(
+        args.src,
+        args.output_dir,
+        args.temp_dir,
+        final=("base" in args.quants),
+        resume=args.resume,
+    )
+    if "base" in args.quants:
+        args.quants = [x for x in args.quants if x not in ["base"]]
+    if base_quant is not None:
+        out_files.append(base_quant)
+    for qtype in args.quants:
+        out_files.append(make_quant(
+            quant_source,
+            args.output_dir,
+            args.temp_dir,
+            qtype,
+            quantize_binary,
+            fix_path,
+            resume=args.resume,
+        ))
+    if fix_path is not None and os.path.isfile(fix_path):
+        os.remove(fix_path)
+    if base_quant != quant_source:
+        # make sure our quant source is in the temp folder before removing it
+        cc = os.path.commonpath([os.path.normpath(quant_source), os.path.normpath(args.temp_dir)])
+        if cc == os.path.normpath(args.temp_dir):
+            os.remove(quant_source)
+    out_file_str = '\n'.join(out_files)
+    logging.info(f"Output file(s): {out_file_str}")