/*
   This script quantizes the weights of the 3 GPT encoders.

   5 quantization types are available:
       - q4_0
       - q4_1
       - q5_0
       - q5_1
       - q8_0

   Usage:
       ./quantize \
            ./ggml_weights/ggml_weights_text.bin \
            ./ggml_weights_q4/ggml_weights_text_quant.bin \
            type
*/
#include "ggml.h"
#include "bark.h"
#include "bark-util.h"

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <map>
#include <regex>
#include <string>
#include <vector>

// Maps the user-facing type string (argv[3]) to the corresponding ggml file type.
static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
    {"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
    {"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
    {"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
    {"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
    {"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
};

// Prints the list of supported quantization type strings (and their numeric
// ids) to `fp` — used for the usage message.
void ggml_print_ftypes(FILE * fp) {
    for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) {
        fprintf(fp, "  type = \"%s\" or %d\n", it->first.c_str(), it->second);
    }
}

// Parses a quantization type from a command-line string.
// Accepts either a symbolic name ("q4_0", ...) or a raw integer id.
// Returns GGML_FTYPE_UNKNOWN for an unrecognized symbolic name.
// NOTE(review): the integer path uses atoi, which returns 0 (GGML_FTYPE_ALL_F32)
// for garbage input — kept as-is for interface compatibility.
enum ggml_ftype ggml_parse_ftype(const char * str) {
    enum ggml_ftype ftype;
    if (str[0] == 'q') {
        const auto it = GGML_FTYPE_MAP.find(str);
        if (it == GGML_FTYPE_MAP.end()) {
            fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);
            return GGML_FTYPE_UNKNOWN;
        }
        ftype = it->second;
    } else {
        ftype = (enum ggml_ftype) atoi(str);
    }
    return ftype;
}

// Streams tensors from `fin` to `fout`, quantizing to the type implied by
// `ftype` every 2D tensor whose name matches one of the `to_quant` regexes
// (unless it also matches a `to_skip` regex). All other tensors are copied
// verbatim. Returns false on an unsupported ftype/ttype.
bool ggml_common_quantize_0(
        std::ifstream & fin,
        std::ofstream & fout,
        const ggml_ftype ftype,
        const std::vector<std::string> & to_quant,
        const std::vector<std::string> & to_skip) {

    ggml_type qtype = GGML_TYPE_F32;

    switch (ftype) {
        case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
        case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
        case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
        case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
        case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
        case GGML_FTYPE_UNKNOWN:
        case GGML_FTYPE_ALL_F32:
        case GGML_FTYPE_MOSTLY_F16:
        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
        case GGML_FTYPE_MOSTLY_Q2_K:
        case GGML_FTYPE_MOSTLY_Q3_K:
        case GGML_FTYPE_MOSTLY_Q4_K:
        case GGML_FTYPE_MOSTLY_Q5_K:
        case GGML_FTYPE_MOSTLY_Q6_K:
                {
                    fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
                    return false;
                }
    };

    if (!ggml_is_quantized(qtype)) {
        fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype));
        return false;
    }

    size_t total_size_org = 0;
    size_t total_size_new = 0;

    std::vector<float> work;

    std::vector<uint8_t>     data_u8;
    std::vector<ggml_fp16_t> data_f16;
    std::vector<float>       data_f32;

    // 16-bin histogram of quantized values, accumulated over all tensors.
    std::vector<int64_t> hist_all(1 << 4, 0);

    // Compile the name-matching regexes once, instead of re-compiling them
    // for every pattern on every tensor inside the loop below.
    std::vector<std::regex> re_quant;
    std::vector<std::regex> re_skip;
    for (const auto & s : to_quant) { re_quant.emplace_back(s); }
    for (const auto & s : to_skip)  { re_skip.emplace_back(s);  }

    while (true) {
        int32_t n_dims;
        int32_t length;
        int32_t ttype;

        read_safe(fin, n_dims);
        read_safe(fin, length);
        read_safe(fin, ttype);

        if (fin.eof()) {
            break;
        }

        int32_t nelements = 1;
        int32_t ne[4] = { 1, 1, 1, 1 };
        for (int i = 0; i < n_dims; ++i) {
            read_safe(fin, ne[i]);
            nelements *= ne[i];
        }

        std::string name(length, 0);
        fin.read(&name[0], length);

        printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype));

        bool quantize = false;

        // check if we should quantize this tensor
        for (const auto & re : re_quant) {
            if (std::regex_match(name, re)) {
                quantize = true;
                break;
            }
        }

        // check if we should skip this tensor
        for (const auto & re : re_skip) {
            if (std::regex_match(name, re)) {
                quantize = false;
                break;
            }
        }

        // quantize only 2D tensors
        quantize &= (n_dims == 2);

        if (quantize) {
            if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) {
                fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
                return false;
            }

            // Load the source data, widening f16 to f32 since the
            // ggml_quantize_* functions take float input.
            if (ttype == GGML_TYPE_F16) {
                data_f16.resize(nelements);
                fin.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
                data_f32.resize(nelements);
                for (int i = 0; i < nelements; ++i) {
                    data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
                }
            } else {
                data_f32.resize(nelements);
                fin.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
            }

            ttype = qtype;
        } else {
            // pass-through: bytes-per-element depends on the stored type
            // (0 == f32, otherwise f16)
            const int bpe = (ttype == 0) ? sizeof(float) : sizeof(uint16_t);

            data_u8.resize(nelements*bpe);
            fin.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
        }

        // write the (possibly updated) tensor header
        write_safe(fout, n_dims);
        write_safe(fout, length);
        write_safe(fout, ttype);

        for (int i = 0; i < n_dims; ++i) {
            write_safe(fout, ne[i]);
        }
        fout.write(&name[0], length);

        if (quantize) {
            work.resize(nelements); // for quantization

            size_t cur_size = 0;
            // per-tensor histogram, folded into hist_all below
            std::vector<int64_t> hist_cur(1 << 4, 0);

            switch ((ggml_type) ttype) {
                case GGML_TYPE_Q4_0:
                    {
                        cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                    } break;
                case GGML_TYPE_Q4_1:
                    {
                        cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                    } break;
                case GGML_TYPE_Q5_0:
                    {
                        cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                    } break;
                case GGML_TYPE_Q5_1:
                    {
                        cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                    } break;
                case GGML_TYPE_Q8_0:
                    {
                        cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                    } break;
                case GGML_TYPE_F32:
                case GGML_TYPE_F16:
                case GGML_TYPE_I8:
                case GGML_TYPE_I16:
                case GGML_TYPE_I32:
                case GGML_TYPE_Q8_1:
                case GGML_TYPE_Q2_K:
                case GGML_TYPE_Q3_K:
                case GGML_TYPE_Q4_K:
                case GGML_TYPE_Q5_K:
                case GGML_TYPE_Q6_K:
                case GGML_TYPE_Q8_K:
                case GGML_TYPE_COUNT:
                    {
                        fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
                        return false;
                    }
            }

            fout.write(reinterpret_cast<char *>(work.data()), cur_size);
            total_size_new += cur_size;

            printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
            for (int i = 0; i < (int) hist_cur.size(); ++i) {
                hist_all[i] += hist_cur[i];
            }

            for (int i = 0; i < (int) hist_cur.size(); ++i) {
                printf("%5.3f ", hist_cur[i] / (float)nelements);
            }
            printf("\n");
        } else {
            printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
            fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
            total_size_new += data_u8.size();
        }

        total_size_org += nelements * sizeof(float);
    }

    printf("%s: model size  = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
    printf("%s: quant size  = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));

    {
        int64_t sum_all = 0;
        for (int i = 0; i < (int) hist_all.size(); ++i) {
            sum_all += hist_all[i];
        }

        printf("%s: hist: ", __func__);
        for (int i = 0; i < (int) hist_all.size(); ++i) {
            printf("%5.3f ", hist_all[i] / (float)sum_all);
        }
        printf("\n");
    }

    return true;
}

// Quantizes one GPT encoder file: copies the magic and hyperparameters from
// `fname_inp` to `fname_out` (updating the stored ftype/quantization version),
// then quantizes the weight tensors matching the `to_quant` patterns.
bool bark_model_quantize(
        const std::string & fname_inp,
        const std::string & fname_out,
        ggml_ftype ftype) {

    printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());

    gpt_model model;

    auto fin = std::ifstream(fname_inp, std::ios::binary);
    if (!fin) {
        fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
        return false;
    }

    auto fout = std::ofstream(fname_out, std::ios::binary);
    if (!fout) {
        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
        return false;
    }

    // verify magic
    {
        uint32_t magic;
        fin.read((char *) &magic, sizeof(magic));
        if (magic != GGML_FILE_MAGIC) {
            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
            return false;
        }

        fout.write((char *) &magic, sizeof(magic));
    }

    // load hparams
    {
        auto & hparams = model.hparams;

        read_safe(fin, hparams.n_layer);
        read_safe(fin, hparams.n_head);
        read_safe(fin, hparams.n_embd);
        read_safe(fin, hparams.block_size);
        read_safe(fin, hparams.n_in_vocab);
        read_safe(fin, hparams.n_out_vocab);
        read_safe(fin, hparams.n_lm_heads);
        read_safe(fin, hparams.n_wtes);
        read_safe(fin, hparams.ftype);

        // the stored ftype encodes both the quantization version and the
        // tensor type: ftype = qntvr * GGML_QNT_VERSION_FACTOR + type
        const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
        int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;

        printf("%s: n_in_vocab  = %d\n", __func__, hparams.n_in_vocab);
        printf("%s: n_out_vocab = %d\n", __func__, hparams.n_out_vocab);
        printf("%s: block_size  = %d\n", __func__, hparams.block_size);
        printf("%s: n_embd      = %d\n", __func__, hparams.n_embd);
        printf("%s: n_head      = %d\n", __func__, hparams.n_head);
        printf("%s: n_layer     = %d\n", __func__, hparams.n_layer);
        printf("%s: n_lm_heads  = %d\n", __func__, hparams.n_lm_heads);
        printf("%s: n_wtes      = %d\n", __func__, hparams.n_wtes);
        printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
        printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
        printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
        printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);

        write_safe(fout, hparams.n_layer);
        write_safe(fout, hparams.n_head);
        write_safe(fout, hparams.n_embd);
        write_safe(fout, hparams.block_size);
        write_safe(fout, hparams.n_in_vocab);
        write_safe(fout, hparams.n_out_vocab);
        write_safe(fout, hparams.n_lm_heads);
        write_safe(fout, hparams.n_wtes);
        write_safe(fout, ftype_dst);
    }

    // regexes of tensor names to be quantized
    const std::vector<std::string> to_quant = {
        "model/wte/.*",
        "model/lm_head/.*",
        "model/h.*/attn/c_attn/w",
        "model/h.*/attn/c_proj/w",
        "model/h.*/mlp/c_fc/w",
        "model/h.*/mlp/c_proj/w",
    };

    if (!ggml_common_quantize_0(fin, fout, ftype, to_quant, {})) {
        fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
        return false;
    }

    fin.close();
    fout.close();

    return true;
}

// usage: ./quantize model-f32.bin model-quant.bin type
int main(int argc, char ** argv) {
    if (argc != 4) {
        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
        ggml_print_ftypes(stderr);
        return 1;
    }

    // needed to initialize f16 tables
    {
        struct ggml_init_params params = { 0, NULL, false };
        struct ggml_context * ctx = ggml_init(params);
        ggml_free(ctx);
    }

    const std::string fname_inp = argv[1];
    const std::string fname_out = argv[2];

    const ggml_ftype ftype = ggml_parse_ftype(argv[3]);

    const int64_t t_main_start_us = ggml_time_us();

    int64_t t_quantize_us = 0;

    // load the model
    {
        const int64_t t_start_us = ggml_time_us();

        if (!bark_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
            return 1;
        }

        t_quantize_us = ggml_time_us() - t_start_us;
    }

    // report timing
    {
        const int64_t t_main_end_us = ggml_time_us();

        printf("\n");
        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
        printf("%s: total time    = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
    }

    return 0;
}