# mirror of https://github.com/PABannier/bark.cpp
# synced 2026-03-05 14:40:44 +01:00
# 353 lines | 12 KiB | Python
"""Convert Bark's GPT and Encodec checkpoints into the GGML format.
|
|
|
|
The file is structured as follows:
|
|
- Hyperparameters
|
|
- Vocabulary
|
|
- Text model
|
|
- Coarse model
|
|
- Fine model
|
|
|
|
The bytes are packed in a binary file in the following order:
|
|
- Magic (`ggml` in binary format)
|
|
- Tensors
|
|
|
|
For each tensor, the bytes are packed as follows:
|
|
- Number of dimensions (int)
|
|
- Name length (int)
|
|
- Dimensions (int[n_dims])
|
|
- Name (char[name_length])
|
|
- Data (float[n_dims])
|
|
|
|
Example
|
|
-------
|
|
```bash
|
|
python convert.py \
|
|
--dir-model ~/.cache/suno/bark_v0 \
|
|
--vocab-path ./ggml_weights/ \
|
|
--out-dir ./ggml_weights/ \
|
|
--use-f16
|
|
```
|
|
"""
|
|
import argparse
|
|
from pathlib import Path
|
|
import re
|
|
import struct
|
|
import json
|
|
|
|
import numpy as np
|
|
import torch
|
|
|
|
|
|
# Decoder layers 3/6/9/12 are transposed convolutions (the upsampling path);
# their tensors must be renamed to the "convtr" ggml names rather than the
# regular "conv" names.
DECODER_CONV_TRANSPOSE_LAYERS = [
    f"decoder.layers.{layer_idx}.conv.{suffix}"
    for layer_idx in (3, 6, 9, 12)
    for suffix in ("bias", "weight")
]
|
|
|
|
|
|
parser = argparse.ArgumentParser(
    description="Convert Bark GPT and Encodec checkpoints to the GGML format."
)
parser.add_argument(
    "--dir-model",
    type=str,
    required=True,
    help="directory containing pytorch_model.bin, config.json and vocab.txt",
)
parser.add_argument(
    "--out-dir",
    type=str,
    required=False,
    help="output directory (defaults to --dir-model)",
)
# The module docstring's example passes --vocab-path; accept it so the
# documented invocation works. The vocabulary is always read from
# <dir-model>/vocab.txt, so the value is currently unused.
parser.add_argument("--vocab-path", type=str, required=False)
parser.add_argument(
    "--use-f16",
    action="store_true",
    help="store eligible 2D weight tensors as float16",
)
|
|
|
|
|
|
def parse_codec_hparams(config, outfile, use_f16):
    """Serialize the Encodec hyperparameters as nine consecutive int32 values."""
    bandwidth = 24  # TODO: hardcoded
    fields = (
        config["audio_channels"],
        config["hidden_size"],
        config["num_filters"],
        config["kernel_size"],
        config["residual_kernel_size"],
        config["codebook_size"],
        bandwidth,
        config["sampling_rate"],
        int(use_f16),  # ftype flag: 1 when float16 storage was requested
    )
    # One pack call with a repeat count emits the same bytes as nine
    # individual struct.pack("i", ...) writes.
    outfile.write(struct.pack("9i", *fields))
|
|
|
|
|
|
def parse_hparams(config, prefix, outfile, use_f16):
    """Serialize the GPT hyperparameters of the submodel named `prefix`.

    Reads `config[f"{prefix}_config"]` and writes the int32 fields in the
    order the C++ loader expects.
    """
    hparams = config[f"{prefix}_config"]

    for field in ("num_layers", "num_heads", "hidden_size", "block_size"):
        outfile.write(struct.pack("i", hparams[field]))

    # Trick: the fine model always needs bias=True, since its layer norms
    # carry biases (to refactor).
    has_bias = True if prefix == "fine_acoustics" else hparams["bias"]
    outfile.write(struct.pack("i", int(has_bias)))

    outfile.write(
        struct.pack("ii", hparams["input_vocab_size"], hparams["output_vocab_size"])
    )

    # Only the fine acoustics model carries the codebook counts; every other
    # submodel uses a single LM head and a single token-embedding table.
    if "n_codes_total" in hparams and "n_codes_given" in hparams:
        n_wtes = hparams["n_codes_total"]
        n_lm_heads = n_wtes - hparams["n_codes_given"]
    else:
        n_lm_heads = n_wtes = 1

    outfile.write(struct.pack("iii", n_lm_heads, n_wtes, int(use_f16)))
|
|
|
|
|
|
def parse_codec_model_weights(checkpoint, outfile, use_f16):
    """Load encodec model checkpoint and serialize its tensors to `outfile`.

    Every `codec_model.*` tensor is renamed to the name encodec.cpp expects,
    then written as: (n_dims, name_length, ftype) int32 header, the
    dimensions in reverse (innermost-first) order, the UTF-8 name, and the
    raw data. Weight-normalized convolutions ("weight_v"/"weight_g" pairs)
    are fused into a single "weight" tensor before writing.

    NOTE: mirrors the upstream layout — no tensor count is written for the
    codec section, and `outfile` is closed when this function returns
    (the codec section is the last thing written to the file).
    """
    keys = [k for k in checkpoint.keys() if "codec_model" in k]

    for name in keys:
        if "weight_g" in name:
            # The tensor is parsed together with its corresponding
            # "weight_v" tensor to form the final convolution weights,
            # therefore we skip it here.
            continue

        if "inited" in name or "cluster_size" in name or "embed_avg" in name:
            # Quantizer bookkeeping buffers; not used for the forward pass.
            continue

        # Remove the "codec_model." prefix (and its dot) from the name.
        clean_name = name.replace("codec_model.", "")

        var_data = checkpoint[name]

        if "weight_v" not in name:
            # NOTE(review): the upstream comment claims 3d conv kernels are
            # not squeezed, but this branch squeezes unconditionally —
            # confirm no needed size-1 dimension is lost here.
            var_data = var_data.numpy().squeeze()
        else:
            # "weight_v" has a matching "weight_g" magnitude tensor
            # (weight normalization). Combine both to build the final
            # convolution weight tensor.
            base_name = name.split(".")[:-1]
            weight_g_name = ".".join(base_name + ["weight_g"])
            var_data_g = checkpoint[weight_g_name]

            final_var_data = torch._weight_norm(var_data, var_data_g, dim=0)
            var_data = final_var_data.numpy()

            name = ".".join(base_name + ["weight"])
            clean_name = name.replace("codec_model.", "")

        # Translate HF-style tensor names into the names encodec.cpp expects.
        # (Dots in the patterns are escaped; the originals used bare `.`,
        # which matched any character — same result on valid names, but
        # tighter against accidental matches.)
        if "encoder" in clean_name or "decoder" in clean_name:
            if clean_name in DECODER_CONV_TRANSPOSE_LAYERS:
                # Transposed convolutions of the decoder upsampling path.
                clean_name = re.sub(
                    r"decoder\.layers\.(\d+)\.conv\.(bias|weight)$",
                    r"decoder.model.\1.convtr.convtr.\2",
                    clean_name,
                )
            elif "conv" in clean_name:
                clean_name = re.sub(
                    r"(encoder|decoder)\.layers\.(\d+)(.*?)\.conv\.(bias|weight)$",
                    r"\1.model.\2\3.conv.conv.\4",
                    clean_name,
                )
            elif "lstm" in clean_name:
                clean_name = clean_name.replace("layers", "model")
        elif "quantizer" in clean_name:
            clean_name = re.sub(
                r"quantizer\.layers\.(\d+)\.codebook\.(.+)$",
                r"quantizer.vq.layers.\1._codebook.\2",
                clean_name,
            )
        else:
            raise Exception(f"Unrecognized variable name: {clean_name}")

        print(f"Processing variable: {name} with shape: {var_data.shape}")

        if use_f16:
            if "embed" in name:
                # Keep codebook embeddings in full precision.
                print(" Converting to float32")
                var_data = var_data.astype(np.float32)
                ftype_cur = 0
            elif "weight" in name:
                print(" Converting to float16")
                var_data = var_data.astype(np.float16)
                ftype_cur = 1
            else:
                print(" Converting to float32")
                var_data = var_data.astype(np.float32)
                ftype_cur = 0
        else:
            print(" Converting to float32")
            var_data = var_data.astype(np.float32)
            ftype_cur = 0

        n_dims = len(var_data.shape)
        encoded_name = clean_name.encode("utf-8")
        outfile.write(struct.pack("iii", n_dims, len(encoded_name), ftype_cur))

        # Dimensions are serialized innermost-first (reverse of numpy order).
        for i in range(n_dims):
            outfile.write(struct.pack("i", var_data.shape[n_dims - 1 - i]))
        outfile.write(encoded_name)

        var_data.tofile(outfile)

    outfile.close()
|
|
|
|
|
|
def parse_model_weights(checkpoint, prefix, outfile, use_f16):
|
|
"""Load GPT model checkpoint (text, fine, coarse)."""
|
|
keys = [k for k in checkpoint.keys() if prefix in k]
|
|
keys = [k for k in keys if "attn.bias" not in k]
|
|
|
|
num_tensors = len(keys)
|
|
outfile.write(struct.pack("i", num_tensors))
|
|
|
|
# Filter out the variables that are not part of the current model with prefix
|
|
|
|
for name in keys:
|
|
var_data = checkpoint[name].squeeze().numpy()
|
|
print(f"Processing variable: {name} with shape: {var_data.shape}")
|
|
|
|
n_dims = len(var_data.shape)
|
|
|
|
# Remove prefix from the variable name and the dot
|
|
name = name.replace(prefix + ".", "")
|
|
|
|
# rename headers to keep compatibility
|
|
if name == "layernorm_final.weight":
|
|
name = "model/ln_f/g"
|
|
elif name == "layernorm_final.bias":
|
|
name = "model/ln_f/b"
|
|
elif name == "input_embeds_layer.weight":
|
|
name = "model/wte/0"
|
|
elif re.match(r"input_embeds_layers\.\d+\.weight", name):
|
|
i = re.findall("\d+", name)[0]
|
|
name = f"model/wte/{i}"
|
|
elif name == "position_embeds_layer.weight":
|
|
name = "model/wpe"
|
|
elif name == "lm_head.weight":
|
|
name = "model/lm_head/0"
|
|
elif re.match(r"layers\.\d+\.layernorm_1\.weight", name):
|
|
i = re.findall("\d+", name)[0]
|
|
name = f"model/h{i}/ln_1/g"
|
|
elif re.match(r"layers\.\d+\.layernorm_1\.bias", name):
|
|
i = re.findall("\d+", name)[0]
|
|
name = f"model/h{i}/ln_1/b"
|
|
elif re.match(r"layers\.\d+\.layernorm_2\.weight", name):
|
|
i = re.findall("\d+", name)[0]
|
|
name = f"model/h{i}/ln_2/g"
|
|
elif re.match(r"layers\.\d+\.layernorm_2\.bias", name):
|
|
i = re.findall("\d+", name)[0]
|
|
name = f"model/h{i}/ln_2/b"
|
|
elif re.match(r"layers\.\d+\.attn\.bias", name):
|
|
# this pattern is the lower triangular matrix of the attention bias
|
|
# we do not need to load it
|
|
continue
|
|
elif re.match(r"layers\.\d+\.attn\.att_proj\.weight", name):
|
|
i = re.findall("\d+", name)[0]
|
|
name = f"model/h{i}/attn/c_attn/w"
|
|
elif re.match(r"layers\.\d+\.attn\.out_proj\.weight", name):
|
|
i = re.findall("\d+", name)[0]
|
|
name = f"model/h{i}/attn/c_proj/w"
|
|
elif re.match(r"layers\.\d+\.mlp\.in_proj\.weight", name):
|
|
i = re.findall("\d+", name)[0]
|
|
name = f"model/h{i}/mlp/c_fc/w"
|
|
elif re.match(r"layers\.\d+\.mlp\.out_proj\.weight", name):
|
|
i = re.findall("\d+", name)[0]
|
|
name = f"model/h{i}/mlp/c_proj/w"
|
|
elif re.match(r"lm_heads\.\d+\.weight", name):
|
|
i = re.findall("\d+", name)[0]
|
|
name = f"model/lm_head/{i}"
|
|
else:
|
|
raise Exception(f"Unrecognized variable name: {name}")
|
|
|
|
if use_f16:
|
|
if (name[-2:] == "/w" or "wte" in name or "lm_head" in name) and n_dims == 2:
|
|
print(" Converting to float16")
|
|
var_data = var_data.astype(np.float16)
|
|
ftype_cur = 1
|
|
else:
|
|
print(" Converting to float32")
|
|
var_data = var_data.astype(np.float32)
|
|
ftype_cur = 0
|
|
else:
|
|
print(" Converting to float32")
|
|
var_data = var_data.astype(np.float32)
|
|
ftype_cur = 0
|
|
|
|
encoded_name = name.encode("utf-8")
|
|
|
|
outfile.write(struct.pack("iii", n_dims, len(encoded_name), ftype_cur))
|
|
for i in range(n_dims):
|
|
outfile.write(struct.pack("i", var_data.shape[n_dims - 1 - i]))
|
|
outfile.write(encoded_name)
|
|
|
|
var_data.tofile(outfile)
|
|
|
|
|
|
def generate_file(dir_model, fout, use_f16):
    """Write all Bark submodels (3 GPTs + neural codec) into the open `fout`.

    `fout` must already contain the ggml magic number and the vocabulary;
    this appends the transformer sections, then the codec magic and the
    codec weights. Note: parse_codec_model_weights closes `fout` when it
    finishes, since the codec section is the last thing written.
    """
    checkpoint = torch.load(dir_model / "pytorch_model.bin", map_location="cpu")
    # Use a context manager instead of `json.load(open(...))`, which leaked
    # the config.json file handle.
    with open(dir_model / "config.json", "r") as config_file:
        config = json.load(config_file)

    # Parse transformer hyperparameters and weights.
    for prefix in ["semantic", "coarse_acoustics", "fine_acoustics"]:
        parse_hparams(config, prefix, fout, use_f16)
        parse_model_weights(checkpoint, prefix, fout, use_f16)

    # New model (Encodec.cpp expects the magic number) => re-write it.
    fout.write(struct.pack("i", 0x67676d6c))

    # Parse neural codec weights.
    parse_codec_hparams(config["codec_config"], fout, use_f16)
    parse_codec_model_weights(checkpoint, fout, use_f16)
|
|
|
|
|
|
def generate_vocab_file(dir_model, fout):
    """Parse vocabulary and serialize it as (count, then length/bytes pairs).

    Even if bark relies on GPT to encode text, it uses BertTokenizer
    (WordPiece), whose vocabulary is one token per line in vocab.txt.
    """
    with open(dir_model / "vocab.txt", "r", encoding="utf-8") as fin:
        vocab = fin.readlines()

    fout.write(struct.pack("i", len(vocab)))
    print("Vocab size:", len(vocab))

    for line in vocab:
        # Strip the trailing newline. The final line may legitimately lack
        # one, in which case nothing must be removed — the previous
        # `token[:-1]` chopped the last token's final character.
        token = line[:-1] if line.endswith("\n") else line
        data = token.encode("utf-8")
        fout.write(struct.pack("i", len(data)))
        fout.write(data)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
args = parser.parse_args()
|
|
|
|
dir_model = Path(args.dir_model)
|
|
if not dir_model.exists():
|
|
raise ValueError(f"Could not find directory {dir_model}")
|
|
|
|
if args.out_dir is None:
|
|
out_dir = dir_model
|
|
else:
|
|
out_dir = Path(args.out_dir)
|
|
out_dir.mkdir(exist_ok=True, parents=True)
|
|
|
|
out_file = out_dir / "ggml_weights.bin"
|
|
|
|
# Write magic number
|
|
fout = open(out_file, "wb")
|
|
fout.write(struct.pack("i", 0x67676d6c))
|
|
|
|
generate_vocab_file(dir_model, fout)
|
|
print(" Vocab written.")
|
|
|
|
generate_file(dir_model, fout, args.use_f16)
|
|
print(" Model written.")
|
|
|
|
fout.close()
|
|
|
|
print("Done.")
|