/* This script quantizes the weights of the 3 GPT encoders. 5 quantization types are available: - q4_0 - q4_1 - q5_0 - q5_1 - q8_0 Usage: ```bash ./quantize \ ./ggml_weights.bin \ ./ggml_weights_q4.bin \ q4_0 ``` */ #include #include #include #include #include #include #include #include #include #include "bark.h" #include "ggml.h" static const std::map GGML_FTYPE_MAP = { {"q4_0", GGML_FTYPE_MOSTLY_Q4_0}, {"q4_1", GGML_FTYPE_MOSTLY_Q4_1}, {"q5_0", GGML_FTYPE_MOSTLY_Q5_0}, {"q5_1", GGML_FTYPE_MOSTLY_Q5_1}, {"q8_0", GGML_FTYPE_MOSTLY_Q8_0}, }; void ggml_print_ftypes(FILE* fp) { for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) { fprintf(fp, " type = \"%s\" or %d\n", it->first.c_str(), it->second); } } enum ggml_ftype ggml_parse_ftype(const char* str) { enum ggml_ftype ftype; if (str[0] == 'q') { const auto it = GGML_FTYPE_MAP.find(str); if (it == GGML_FTYPE_MAP.end()) { fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str); return GGML_FTYPE_UNKNOWN; } ftype = it->second; } else { ftype = (enum ggml_ftype)atoi(str); } return ftype; } int main(int argc, char** argv) { if (argc != 4) { fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); ggml_print_ftypes(stderr); return 1; } // needed to initialize f16 tables { struct ggml_init_params params = {0, NULL, false}; struct ggml_context* ctx = ggml_init(params); ggml_free(ctx); } const char* fname_inp = argv[1]; const char* fname_out = argv[2]; const ggml_ftype ftype = ggml_parse_ftype(argv[3]); const int64_t t_main_start_us = ggml_time_us(); int64_t t_quantize_us = 0; // load the model { const int64_t t_start_us = ggml_time_us(); bark_model_quantize(fname_inp, fname_out, ggml_ftype(ftype)); t_quantize_us = ggml_time_us() - t_start_us; } // report timing { const int64_t t_main_end_us = ggml_time_us(); printf("\n"); printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us / 1000.0f); printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f); } return 0; }