"""Convert Bark's GPT and Encodec checkpoints into the GGML format.
The file is structured as follows:
- Hyperparameters
- Vocabulary
- Text model
- Coarse model
- Fine model
The bytes are packed in a binary file in the following order:
- Magic (`ggml` in binary format)
- Tensors
For each tensor, the bytes are packed as follows:
- Number of dimensions (int)
- Name length (int)
- Dimensions (int[n_dims])
- Name (char[name_length])
- Data (float[n_dims])
Example
-------
```bash
python convert.py \
--dir-model ~/.cache/suno/bark_v0 \
--vocab-path ./ggml_weights/ \
--out-dir ./ggml_weights/ \
--use-f16
```
"""
import argparse
import json
import re
import struct
from pathlib import Path

import numpy as np
import torch

DECODER_CONV_TRANSPOSE_LAYERS = [
    "decoder.layers.3.conv.bias",
    "decoder.layers.3.conv.weight",
    "decoder.layers.6.conv.bias",
    "decoder.layers.6.conv.weight",
    "decoder.layers.9.conv.bias",
    "decoder.layers.9.conv.weight",
    "decoder.layers.12.conv.bias",
    "decoder.layers.12.conv.weight",
]
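

# Illustrative only (not used by the converter): one tensor record, as packed
# by the parse_*_weights functions below, can be read back like this. `fin` is
# an assumed open binary file positioned at the start of a record.
def _read_tensor_record(fin):
    """Minimal sketch of the inverse of the tensor packing used in this file."""
    n_dims, name_len, ftype = struct.unpack("iii", fin.read(12))
    # the dimensions are stored innermost first (reversed ggml order)
    dims = struct.unpack("i" * n_dims, fin.read(4 * n_dims))
    name = fin.read(name_len).decode("utf-8")
    dtype = np.float16 if ftype == 1 else np.float32
    data = np.fromfile(fin, dtype=dtype, count=int(np.prod(dims)))
    return name, dims, data
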
parser = argparse.ArgumentParser()
parser.add_argument("--dir-model", type=str, required=True)
parser.add_argument("--out-dir", type=str, required=False)
parser.add_argument("--use-f16", action="store_true")


def parse_codec_hparams(config, outfile, use_f16):
    """Parse Encodec hyperparameters."""
    in_channels = config["audio_channels"]
    hidden_dim = config["hidden_size"]
    n_filters = config["num_filters"]
    kernel_size = config["kernel_size"]
    residual_kernel_size = config["residual_kernel_size"]
    n_bins = config["codebook_size"]
    bandwidth = 24  # TODO: hardcoded
    sr = config["sampling_rate"]
    ftype = int(use_f16)

    outfile.write(struct.pack("i", in_channels))
    outfile.write(struct.pack("i", hidden_dim))
    outfile.write(struct.pack("i", n_filters))
    outfile.write(struct.pack("i", kernel_size))
    outfile.write(struct.pack("i", residual_kernel_size))
    outfile.write(struct.pack("i", n_bins))
    outfile.write(struct.pack("i", bandwidth))
    outfile.write(struct.pack("i", sr))
    outfile.write(struct.pack("i", ftype))


def parse_hparams(config, prefix, outfile, use_f16):
    """Parse GPT hyperparameters."""
    hparams = config[f"{prefix}_config"]

    outfile.write(struct.pack("i", hparams["num_layers"]))
    outfile.write(struct.pack("i", hparams["num_heads"]))
    outfile.write(struct.pack("i", hparams["hidden_size"]))
    outfile.write(struct.pack("i", hparams["block_size"]))

    # trick: the fine model needs the bias flag set to True, since its layer
    # norms do have biases (to refactor)
    bias = True if prefix == "fine_acoustics" else hparams["bias"]
    outfile.write(struct.pack("i", int(bias)))

    outfile.write(
        struct.pack("ii", hparams["input_vocab_size"], hparams["output_vocab_size"])
    )

    # n_codes_total and n_codes_given are only defined for the fine model
    try:
        n_lm_heads = hparams["n_codes_total"] - hparams["n_codes_given"]
        n_wtes = hparams["n_codes_total"]
    except KeyError:
        n_lm_heads, n_wtes = 1, 1

    ftype = int(use_f16)
    outfile.write(struct.pack("iii", n_lm_heads, n_wtes, ftype))


def parse_codec_model_weights(checkpoint, outfile, use_f16):
    """Load encodec model checkpoint."""
    keys = [k for k in checkpoint.keys() if "codec_model" in k]

    for name in keys:
        if "weight_g" in name:
            # this tensor is parsed together with the corresponding "weight_v"
            # tensor to form the final weight tensor of the convolution,
            # therefore we skip it here
            continue

        if "inited" in name or "cluster_size" in name or "embed_avg" in name:
            # the "inited", "cluster_size" and "embed_avg" tensors of the
            # quantizer are not used for the forward pass
            continue

        # remove the "codec_model." prefix from the variable name
        clean_name = name.replace("codec_model.", "")

        var_data = checkpoint[name]

        if "weight_v" not in name:
            # squeeze to drop singleton dimensions (e.g. from bias tensors)
            var_data = var_data.numpy().squeeze()
        else:
            # weight_v has a corresponding magnitude tensor (weight_g) used to
            # rescale the weights of the convolutional layers. We parse both
            # tensors jointly to build the final weight tensor of the
            # convolution (see _weight_norm_reference below for the math).
            base_name = name.split(".")[:-1]
            weight_g_name = ".".join(base_name + ["weight_g"])
            var_data_g = checkpoint[weight_g_name]

            final_var_data = torch._weight_norm(var_data, var_data_g, dim=0)
            var_data = final_var_data.numpy()

            name = ".".join(base_name + ["weight"])
            clean_name = name.replace("codec_model.", "")

        if "encoder" in clean_name or "decoder" in clean_name:
            if clean_name in DECODER_CONV_TRANSPOSE_LAYERS:
                pattern = r"decoder\.layers\.(\d+)\.conv\.(bias|weight)$"
                replacement = r"decoder.model.\1.convtr.convtr.\2"
                clean_name = re.sub(pattern, replacement, clean_name)
            elif "conv" in clean_name:
                pattern = r"(encoder|decoder)\.layers\.(\d+)(.*?)\.conv\.(bias|weight)$"
                replacement = r"\1.model.\2\3.conv.conv.\4"
                clean_name = re.sub(pattern, replacement, clean_name)
            elif "lstm" in clean_name:
                clean_name = clean_name.replace("layers", "model")
        elif "quantizer" in clean_name:
            pattern = r"quantizer\.layers\.(\d+)\.codebook\.(.+)$"
            replacement = r"quantizer.vq.layers.\1._codebook.\2"
            clean_name = re.sub(pattern, replacement, clean_name)
        else:
            raise Exception(f"Unrecognized variable name: {clean_name}")

        print(f"Processing variable: {name} with shape: {var_data.shape}")

        if use_f16:
            if "embed" in name:
                # keep embedding tables (codebooks) in full precision
                print(" Converting to float32")
                var_data = var_data.astype(np.float32)
                ftype_cur = 0
            elif "weight" in name:
                print(" Converting to float16")
                var_data = var_data.astype(np.float16)
                ftype_cur = 1
            else:
                print(" Converting to float32")
                var_data = var_data.astype(np.float32)
                ftype_cur = 0
        else:
            print(" Converting to float32")
            var_data = var_data.astype(np.float32)
            ftype_cur = 0

        n_dims = len(var_data.shape)
        encoded_name = clean_name.encode("utf-8")

        outfile.write(struct.pack("iii", n_dims, len(encoded_name), ftype_cur))
        for i in range(n_dims):
            # dimensions are written innermost first (reversed)
            outfile.write(struct.pack("i", var_data.shape[n_dims - 1 - i]))
        outfile.write(encoded_name)
        var_data.tofile(outfile)

    outfile.close()
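

# Reference sketch (not used by the converter): torch._weight_norm with dim=0
# rescales each output channel of the direction tensor `v` to the magnitude
# stored in `g`, i.e. w = g * v / ||v||, with the norm taken over all
# dimensions except the first.
def _weight_norm_reference(v, g):
    """Illustrative equivalent of torch._weight_norm(v, g, dim=0)."""
    norm = v.norm(2, dim=tuple(range(1, v.dim())), keepdim=True)
    return v * (g / norm)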


def parse_model_weights(checkpoint, prefix, outfile, use_f16):
    """Load GPT model checkpoint (text, coarse, fine)."""
    # keep only the variables that belong to the model with this prefix, and
    # drop the attention bias (the causal mask), which is not a real weight
    keys = [k for k in checkpoint.keys() if prefix in k]
    keys = [k for k in keys if "attn.bias" not in k]

    num_tensors = len(keys)
    outfile.write(struct.pack("i", num_tensors))

    for name in keys:
        var_data = checkpoint[name].squeeze().numpy()
        print(f"Processing variable: {name} with shape: {var_data.shape}")

        n_dims = len(var_data.shape)

        # remove the prefix and the dot from the variable name
        name = name.replace(prefix + ".", "")

        # rename tensors to keep compatibility with the GGML naming scheme
        if name == "layernorm_final.weight":
            name = "model/ln_f/g"
        elif name == "layernorm_final.bias":
            name = "model/ln_f/b"
        elif name == "input_embeds_layer.weight":
            name = "model/wte/0"
        elif re.match(r"input_embeds_layers\.\d+\.weight", name):
            i = re.findall(r"\d+", name)[0]
            name = f"model/wte/{i}"
        elif name == "position_embeds_layer.weight":
            name = "model/wpe"
        elif name == "lm_head.weight":
            name = "model/lm_head/0"
        elif re.match(r"layers\.\d+\.layernorm_1\.weight", name):
            i = re.findall(r"\d+", name)[0]
            name = f"model/h{i}/ln_1/g"
        elif re.match(r"layers\.\d+\.layernorm_1\.bias", name):
            i = re.findall(r"\d+", name)[0]
            name = f"model/h{i}/ln_1/b"
        elif re.match(r"layers\.\d+\.layernorm_2\.weight", name):
            i = re.findall(r"\d+", name)[0]
            name = f"model/h{i}/ln_2/g"
        elif re.match(r"layers\.\d+\.layernorm_2\.bias", name):
            i = re.findall(r"\d+", name)[0]
            name = f"model/h{i}/ln_2/b"
        elif re.match(r"layers\.\d+\.attn\.bias", name):
            # this pattern is the lower triangular matrix of the attention
            # bias; we do not need to load it
            continue
        elif re.match(r"layers\.\d+\.attn\.att_proj\.weight", name):
            i = re.findall(r"\d+", name)[0]
            name = f"model/h{i}/attn/c_attn/w"
        elif re.match(r"layers\.\d+\.attn\.out_proj\.weight", name):
            i = re.findall(r"\d+", name)[0]
            name = f"model/h{i}/attn/c_proj/w"
        elif re.match(r"layers\.\d+\.mlp\.in_proj\.weight", name):
            i = re.findall(r"\d+", name)[0]
            name = f"model/h{i}/mlp/c_fc/w"
        elif re.match(r"layers\.\d+\.mlp\.out_proj\.weight", name):
            i = re.findall(r"\d+", name)[0]
            name = f"model/h{i}/mlp/c_proj/w"
        elif re.match(r"lm_heads\.\d+\.weight", name):
            i = re.findall(r"\d+", name)[0]
            name = f"model/lm_head/{i}"
        else:
            raise Exception(f"Unrecognized variable name: {name}")

        if use_f16:
            # 2D matrix weights, token embeddings and lm heads go to float16,
            # everything else stays in float32
            if (name[-2:] == "/w" or "wte" in name or "lm_head" in name) and n_dims == 2:
                print(" Converting to float16")
                var_data = var_data.astype(np.float16)
                ftype_cur = 1
            else:
                print(" Converting to float32")
                var_data = var_data.astype(np.float32)
                ftype_cur = 0
        else:
            print(" Converting to float32")
            var_data = var_data.astype(np.float32)
            ftype_cur = 0

        encoded_name = name.encode("utf-8")

        outfile.write(struct.pack("iii", n_dims, len(encoded_name), ftype_cur))
        for i in range(n_dims):
            # dimensions are written innermost first (reversed)
            outfile.write(struct.pack("i", var_data.shape[n_dims - 1 - i]))
        outfile.write(encoded_name)
        var_data.tofile(outfile)


def generate_file(dir_model, fout, use_f16):
    """Write the hyperparameters and weights of the GPT and codec models."""
    checkpoint = torch.load(dir_model / "pytorch_model.bin", map_location="cpu")
    with open(dir_model / "config.json", "r") as f:
        config = json.load(f)

    # parse transformer hyperparameters and weights
    for prefix in ["semantic", "coarse_acoustics", "fine_acoustics"]:
        parse_hparams(config, prefix, fout, use_f16)
        parse_model_weights(checkpoint, prefix, fout, use_f16)

    # the codec starts a new model section: encodec.cpp expects the magic
    # number again, so re-write it
    fout.write(struct.pack("i", 0x67676d6c))

    # parse neural codec hyperparameters and weights
    parse_codec_hparams(config["codec_config"], fout, use_f16)
    parse_codec_model_weights(checkpoint, fout, use_f16)


def generate_vocab_file(dir_model, fout):
    """Parse the vocabulary and write it to the output file."""
    # Although Bark relies on a GPT architecture to encode text, it uses a
    # BertTokenizer (WordPiece) for tokenization.
    with open(dir_model / "vocab.txt", "r", encoding="utf-8") as fin:
        vocab = fin.readlines()

    fout.write(struct.pack("i", len(vocab)))
    print("Vocab size:", len(vocab))

    for token in vocab:
        data = bytearray(token[:-1], "utf-8")  # strip the trailing newline
        fout.write(struct.pack("i", len(data)))
        fout.write(data)
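
# Illustrative only: the vocabulary section written above can be read back
# with a loop like this sketch (`fin` positioned right after the magic):
#
#     (n_vocab,) = struct.unpack("i", fin.read(4))
#     for _ in range(n_vocab):
#         (token_len,) = struct.unpack("i", fin.read(4))
#         token = fin.read(token_len).decode("utf-8")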


if __name__ == "__main__":
    args = parser.parse_args()

    dir_model = Path(args.dir_model)
    if not dir_model.exists():
        raise ValueError(f"Could not find directory {dir_model}")

    if args.out_dir is None:
        out_dir = dir_model
    else:
        out_dir = Path(args.out_dir)
        out_dir.mkdir(exist_ok=True, parents=True)

    out_file = out_dir / "ggml_weights.bin"

    fout = open(out_file, "wb")

    # write the magic number
    fout.write(struct.pack("i", 0x67676d6c))

    generate_vocab_file(dir_model, fout)
    print(" Vocab written.")

    generate_file(dir_model, fout, args.use_f16)
    print(" Model written.")

    fout.close()

    print("Done.")