"""Convert Bark's GPT and Encodec checkpoints into the GGML format.
The file is structured as follows:
- Hyperparameters
- Vocabulary
- Text model
- Coarse model
- Fine model
The bytes are packed in a binary file in the following order:
- Magic (`ggml` in binary format)
- Tensors
For each tensor, the bytes are packed as follows:
- Number of dimensions (int)
- Name length (int)
- Dimensions (int[n_dims])
- Name (char[name_length])
- Data (float[n_dims])
Example
-------
```bash
python convert.py \
--dir-model ~/.cache/suno/bark_v0 \
--vocab-path ./ggml_weights/ \
--out-dir ./ggml_weights/ \
--use-f16
```
"""
import argparse
import json
import re
import struct
from pathlib import Path

import numpy as np
import torch

DECODER_CONV_TRANSPOSE_LAYERS = [
    "decoder.layers.3.conv.bias",
    "decoder.layers.3.conv.weight",
    "decoder.layers.6.conv.bias",
    "decoder.layers.6.conv.weight",
    "decoder.layers.9.conv.bias",
    "decoder.layers.9.conv.weight",
    "decoder.layers.12.conv.bias",
    "decoder.layers.12.conv.weight",
]
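

# Illustrative only (not used by the converter): one tensor record, as packed
# by the parse_*_weights functions below, can be read back like this. `fin` is
# an assumed open binary file positioned at the start of a record.
def _read_tensor_record(fin):
    """Minimal sketch of the inverse of the tensor packing used in this file."""
    n_dims, name_len, ftype = struct.unpack("iii", fin.read(12))
    # the dimensions are stored innermost first (reversed ggml order)
    dims = struct.unpack("i" * n_dims, fin.read(4 * n_dims))
    name = fin.read(name_len).decode("utf-8")
    dtype = np.float16 if ftype == 1 else np.float32
    data = np.fromfile(fin, dtype=dtype, count=int(np.prod(dims)))
    return name, dims, data
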
parser = argparse.ArgumentParser()
parser.add_argument("--dir-model", type=str, required=True)
parser.add_argument("--out-dir", type=str, required=False)
parser.add_argument("--use-f16", action="store_true")


def parse_codec_hparams(config, outfile, use_f16):
    """Parse Encodec hyperparameters."""
    in_channels = config["audio_channels"]
    hidden_dim = config["hidden_size"]
    n_filters = config["num_filters"]
    kernel_size = config["kernel_size"]
    residual_kernel_size = config["residual_kernel_size"]
    n_bins = config["codebook_size"]
    bandwidth = 24  # TODO: hardcoded
    sr = config["sampling_rate"]
    ftype = int(use_f16)

    outfile.write(struct.pack("i", in_channels))
    outfile.write(struct.pack("i", hidden_dim))
    outfile.write(struct.pack("i", n_filters))
    outfile.write(struct.pack("i", kernel_size))
    outfile.write(struct.pack("i", residual_kernel_size))
    outfile.write(struct.pack("i", n_bins))
    outfile.write(struct.pack("i", bandwidth))
    outfile.write(struct.pack("i", sr))
    outfile.write(struct.pack("i", ftype))


def parse_hparams(config, prefix, outfile, use_f16):
    """Parse GPT hyperparameters."""
    hparams = config[f"{prefix}_config"]

    outfile.write(struct.pack("i", hparams["num_layers"]))
    outfile.write(struct.pack("i", hparams["num_heads"]))
    outfile.write(struct.pack("i", hparams["hidden_size"]))
    outfile.write(struct.pack("i", hparams["block_size"]))

    # trick: the fine model needs the bias flag set to True, since its layer
    # norms do have biases (to refactor)
    bias = True if prefix == "fine_acoustics" else hparams["bias"]
    outfile.write(struct.pack("i", int(bias)))

    outfile.write(
        struct.pack("ii", hparams["input_vocab_size"], hparams["output_vocab_size"])
    )

    # n_codes_total and n_codes_given are only defined for the fine model
    try:
        n_lm_heads = hparams["n_codes_total"] - hparams["n_codes_given"]
        n_wtes = hparams["n_codes_total"]
    except KeyError:
        n_lm_heads, n_wtes = 1, 1

    ftype = int(use_f16)
    outfile.write(struct.pack("iii", n_lm_heads, n_wtes, ftype))


def parse_codec_model_weights(checkpoint, outfile, use_f16):
    """Load encodec model checkpoint."""
    keys = [k for k in checkpoint.keys() if "codec_model" in k]

    for name in keys:
        if "weight_g" in name:
            # this tensor is parsed together with the corresponding "weight_v"
            # tensor to form the final weight tensor of the convolution,
            # therefore we skip it here
            continue

        if "inited" in name or "cluster_size" in name or "embed_avg" in name:
            # the "inited", "cluster_size" and "embed_avg" tensors of the
            # quantizer are not used for the forward pass
            continue

        # remove the "codec_model." prefix from the variable name
        clean_name = name.replace("codec_model.", "")

        var_data = checkpoint[name]

        if "weight_v" not in name:
            # squeeze to drop singleton dimensions (e.g. from bias tensors)
            var_data = var_data.numpy().squeeze()
        else:
            # weight_v has a corresponding magnitude tensor (weight_g) used to
            # rescale the weights of the convolutional layers. We parse both
            # tensors jointly to build the final weight tensor of the
            # convolution (see _weight_norm_reference below for the math).
            base_name = name.split(".")[:-1]
            weight_g_name = ".".join(base_name + ["weight_g"])
            var_data_g = checkpoint[weight_g_name]

            final_var_data = torch._weight_norm(var_data, var_data_g, dim=0)
            var_data = final_var_data.numpy()

            name = ".".join(base_name + ["weight"])
            clean_name = name.replace("codec_model.", "")

        if "encoder" in clean_name or "decoder" in clean_name:
            if clean_name in DECODER_CONV_TRANSPOSE_LAYERS:
                pattern = r"decoder\.layers\.(\d+)\.conv\.(bias|weight)$"
                replacement = r"decoder.model.\1.convtr.convtr.\2"
                clean_name = re.sub(pattern, replacement, clean_name)
            elif "conv" in clean_name:
                pattern = r"(encoder|decoder)\.layers\.(\d+)(.*?)\.conv\.(bias|weight)$"
                replacement = r"\1.model.\2\3.conv.conv.\4"
                clean_name = re.sub(pattern, replacement, clean_name)
            elif "lstm" in clean_name:
                clean_name = clean_name.replace("layers", "model")
        elif "quantizer" in clean_name:
            pattern = r"quantizer\.layers\.(\d+)\.codebook\.(.+)$"
            replacement = r"quantizer.vq.layers.\1._codebook.\2"
            clean_name = re.sub(pattern, replacement, clean_name)
        else:
            raise Exception(f"Unrecognized variable name: {clean_name}")

        print(f"Processing variable: {name} with shape: {var_data.shape}")

        if use_f16:
            if "embed" in name:
                # keep embedding tables (codebooks) in full precision
                print(" Converting to float32")
                var_data = var_data.astype(np.float32)
                ftype_cur = 0
            elif "weight" in name:
                print(" Converting to float16")
                var_data = var_data.astype(np.float16)
                ftype_cur = 1
            else:
                print(" Converting to float32")
                var_data = var_data.astype(np.float32)
                ftype_cur = 0
        else:
            print(" Converting to float32")
            var_data = var_data.astype(np.float32)
            ftype_cur = 0

        n_dims = len(var_data.shape)
        encoded_name = clean_name.encode("utf-8")

        outfile.write(struct.pack("iii", n_dims, len(encoded_name), ftype_cur))
        for i in range(n_dims):
            # dimensions are written innermost first (reversed)
            outfile.write(struct.pack("i", var_data.shape[n_dims - 1 - i]))
        outfile.write(encoded_name)
        var_data.tofile(outfile)

    outfile.close()
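

# Reference sketch (not used by the converter): torch._weight_norm with dim=0
# rescales each output channel of the direction tensor `v` to the magnitude
# stored in `g`, i.e. w = g * v / ||v||, with the norm taken over all
# dimensions except the first.
def _weight_norm_reference(v, g):
    """Illustrative equivalent of torch._weight_norm(v, g, dim=0)."""
    norm = v.norm(2, dim=tuple(range(1, v.dim())), keepdim=True)
    return v * (g / norm)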


def parse_model_weights(checkpoint, prefix, outfile, use_f16):
    """Load GPT model checkpoint (text, coarse, fine)."""
    # keep only the variables that belong to the model with this prefix, and
    # drop the attention bias (the causal mask), which is not a real weight
    keys = [k for k in checkpoint.keys() if prefix in k]
    keys = [k for k in keys if "attn.bias" not in k]

    num_tensors = len(keys)
    outfile.write(struct.pack("i", num_tensors))

    for name in keys:
        var_data = checkpoint[name].squeeze().numpy()
        print(f"Processing variable: {name} with shape: {var_data.shape}")

        n_dims = len(var_data.shape)

        # remove the prefix and the dot from the variable name
        name = name.replace(prefix + ".", "")

        # rename tensors to keep compatibility with the GGML naming scheme
        if name == "layernorm_final.weight":
            name = "model/ln_f/g"
        elif name == "layernorm_final.bias":
            name = "model/ln_f/b"
        elif name == "input_embeds_layer.weight":
            name = "model/wte/0"
        elif re.match(r"input_embeds_layers\.\d+\.weight", name):
            i = re.findall(r"\d+", name)[0]
            name = f"model/wte/{i}"
        elif name == "position_embeds_layer.weight":
            name = "model/wpe"
        elif name == "lm_head.weight":
            name = "model/lm_head/0"
        elif re.match(r"layers\.\d+\.layernorm_1\.weight", name):
            i = re.findall(r"\d+", name)[0]
            name = f"model/h{i}/ln_1/g"
        elif re.match(r"layers\.\d+\.layernorm_1\.bias", name):
            i = re.findall(r"\d+", name)[0]
            name = f"model/h{i}/ln_1/b"
        elif re.match(r"layers\.\d+\.layernorm_2\.weight", name):
            i = re.findall(r"\d+", name)[0]
            name = f"model/h{i}/ln_2/g"
        elif re.match(r"layers\.\d+\.layernorm_2\.bias", name):
            i = re.findall(r"\d+", name)[0]
            name = f"model/h{i}/ln_2/b"
        elif re.match(r"layers\.\d+\.attn\.bias", name):
            # this pattern is the lower triangular matrix of the attention
            # bias; we do not need to load it
            continue
        elif re.match(r"layers\.\d+\.attn\.att_proj\.weight", name):
            i = re.findall(r"\d+", name)[0]
            name = f"model/h{i}/attn/c_attn/w"
        elif re.match(r"layers\.\d+\.attn\.out_proj\.weight", name):
            i = re.findall(r"\d+", name)[0]
            name = f"model/h{i}/attn/c_proj/w"
        elif re.match(r"layers\.\d+\.mlp\.in_proj\.weight", name):
            i = re.findall(r"\d+", name)[0]
            name = f"model/h{i}/mlp/c_fc/w"
        elif re.match(r"layers\.\d+\.mlp\.out_proj\.weight", name):
            i = re.findall(r"\d+", name)[0]
            name = f"model/h{i}/mlp/c_proj/w"
        elif re.match(r"lm_heads\.\d+\.weight", name):
            i = re.findall(r"\d+", name)[0]
            name = f"model/lm_head/{i}"
        else:
            raise Exception(f"Unrecognized variable name: {name}")

        if use_f16:
            # 2D matrix weights, token embeddings and lm heads go to float16,
            # everything else stays in float32
            if (name[-2:] == "/w" or "wte" in name or "lm_head" in name) and n_dims == 2:
                print(" Converting to float16")
                var_data = var_data.astype(np.float16)
                ftype_cur = 1
            else:
                print(" Converting to float32")
                var_data = var_data.astype(np.float32)
                ftype_cur = 0
        else:
            print(" Converting to float32")
            var_data = var_data.astype(np.float32)
            ftype_cur = 0

        encoded_name = name.encode("utf-8")

        outfile.write(struct.pack("iii", n_dims, len(encoded_name), ftype_cur))
        for i in range(n_dims):
            # dimensions are written innermost first (reversed)
            outfile.write(struct.pack("i", var_data.shape[n_dims - 1 - i]))
        outfile.write(encoded_name)
        var_data.tofile(outfile)


def generate_file(dir_model, fout, use_f16):
    """Write the hyperparameters and weights of the GPT and codec models."""
    checkpoint = torch.load(dir_model / "pytorch_model.bin", map_location="cpu")
    with open(dir_model / "config.json", "r") as f:
        config = json.load(f)

    # parse transformer hyperparameters and weights
    for prefix in ["semantic", "coarse_acoustics", "fine_acoustics"]:
        parse_hparams(config, prefix, fout, use_f16)
        parse_model_weights(checkpoint, prefix, fout, use_f16)

    # the codec starts a new model section: encodec.cpp expects the magic
    # number again, so re-write it
    fout.write(struct.pack("i", 0x67676d6c))

    # parse neural codec hyperparameters and weights
    parse_codec_hparams(config["codec_config"], fout, use_f16)
    parse_codec_model_weights(checkpoint, fout, use_f16)


def generate_vocab_file(dir_model, fout):
    """Parse the vocabulary and write it to the output file."""
    # Although Bark relies on a GPT architecture to encode text, it uses a
    # BertTokenizer (WordPiece) for tokenization.
    with open(dir_model / "vocab.txt", "r", encoding="utf-8") as fin:
        vocab = fin.readlines()

    fout.write(struct.pack("i", len(vocab)))
    print("Vocab size:", len(vocab))

    for token in vocab:
        data = bytearray(token[:-1], "utf-8")  # strip the trailing newline
        fout.write(struct.pack("i", len(data)))
        fout.write(data)
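
# Illustrative only: the vocabulary section written above can be read back
# with a loop like this sketch (`fin` positioned right after the magic):
#
#     (n_vocab,) = struct.unpack("i", fin.read(4))
#     for _ in range(n_vocab):
#         (token_len,) = struct.unpack("i", fin.read(4))
#         token = fin.read(token_len).decode("utf-8")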


if __name__ == "__main__":
    args = parser.parse_args()

    dir_model = Path(args.dir_model)
    if not dir_model.exists():
        raise ValueError(f"Could not find directory {dir_model}")

    if args.out_dir is None:
        out_dir = dir_model
    else:
        out_dir = Path(args.out_dir)
        out_dir.mkdir(exist_ok=True, parents=True)

    out_file = out_dir / "ggml_weights.bin"

    fout = open(out_file, "wb")

    # write the magic number
    fout.write(struct.pack("i", 0x67676d6c))

    generate_vocab_file(dir_model, fout)
    print(" Vocab written.")

    generate_file(dir_model, fout, args.use_f16)
    print(" Model written.")

    fout.close()

    print("Done.")