diff --git a/CMakeLists.txt b/CMakeLists.txt index 2bca6e7..cde1ba0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,4 +37,4 @@ endif() target_link_libraries(${ENCODEC_LIB} PUBLIC ggml) target_include_directories(${ENCODEC_LIB} PUBLIC .) -target_compile_features(${ENCODEC_LIB} PUBLIC cxx_std_11) +target_compile_features(${ENCODEC_LIB} PUBLIC cxx_std_14) diff --git a/convert.py b/convert.py index 2dcd25a..3b029a6 100644 --- a/convert.py +++ b/convert.py @@ -11,7 +11,7 @@ - Name (char[name_length]) - Data (float[n_dims]) -NOTE +Note ---- Encodec uses weight normalization for its convolutional layers. All the weights are decomposed into two tensors called with the suffixes _weight_v and _weight_g. A simple @@ -19,6 +19,15 @@ convolution from weight_v and weight_g. To drastically reduce the number of operations at inference time, the ggml weights file only contain the final convolution weights but does not store the decomposition into weight_v and weight_g. + +Usage +----- + +```bash + python convert.py \ + --dir-model ./ggml_weights/ \ + --out-dir ./ggml_weights/ +``` """ import argparse from pathlib import Path @@ -32,7 +41,11 @@ parser.add_argument("--out-dir", type=str, required=True) -def parse_model(checkpoint, outfile): +def parse_codec_model(checkpoint, out_dir): + """Load encodec model checkpoint.""" + outfile = open(out_dir, "wb") + outfile.write(struct.pack("i", 0x67676d6c)) # ggml magic + for name in checkpoint.keys(): if "weight_g" in name: # the tensor has already been parsed with the corresponding "weight_v" @@ -40,6 +53,11 @@ def parse_model(checkpoint, outfile): # we skip it continue + if "inited" in name or "cluster_size" in name or "embed_avg" in name: + # "inited", "cluster_size" and "embed_avg" tensors in quantizer are not used + # for the forward pass + continue + var_data = checkpoint[name] if not "weight_v" in name: @@ -49,7 +67,7 @@ def parse_model(checkpoint, outfile): # weight_v has its corresponding magnitude tensor to rescale the weights # of the convolutional layers. We parse both kinds of weights jointly to # build the final weight tensor of the convolution. - base_name = name.split(".")[:-1] + base_name = name.split(".")[:-1] weight_g_name = ".".join(base_name + ["weight_g"]) var_data_g = checkpoint[weight_g_name] @@ -75,6 +93,8 @@ def parse_model(checkpoint, outfile): var_data.tofile(outfile) + outfile.close() + if __name__ == "__main__": args = parser.parse_args() @@ -84,12 +104,9 @@ def parse_model(checkpoint, outfile): out_dir = Path(args.out_dir) out_dir.mkdir(exist_ok=True, parents=True) - outfile = open(out_dir / "ggml-model.bin", "wb") - outfile.write(struct.pack("i", 0x67676d6c)) # ggml magic + outfile = Path(out_dir / "ggml-model.bin") checkpoint = torch.load(dir_model / "encodec_24khz-d7cc33bc.th", map_location="cpu") - parse_model(checkpoint, outfile) - - outfile.close() + parse_codec_model(checkpoint, outfile) print("Done.") diff --git a/encodec.cpp b/encodec.cpp index fbb50a5..a995884 100644 --- a/encodec.cpp +++ b/encodec.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -139,7 +140,8 @@ static struct ggml_tensor * forward_pass_lstm_unilayer( struct ggml_tensor * weight_ih, struct ggml_tensor * weight_hh, struct ggml_tensor * bias_ih, - struct ggml_tensor * bias_hh) { + struct ggml_tensor * bias_hh, + bool is_measure) { const int input_dim = inp->ne[1]; const int hidden_dim = weight_ih->ne[1]/4; @@ -150,8 +152,10 @@ static struct ggml_tensor * forward_pass_lstm_unilayer( struct ggml_tensor * c_t = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, hidden_dim); struct ggml_tensor * h_t = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, hidden_dim); - h_t = ggml_set_zero(h_t); - c_t = ggml_set_zero(c_t); + if (is_measure) { + h_t = ggml_set_zero(h_t); + c_t = ggml_set_zero(c_t); + } struct ggml_tensor * current = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); @@ -168,7 +172,7 @@ static struct ggml_tensor * forward_pass_lstm_unilayer( struct ggml_tensor * i_t = encodec_sigmoid(ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 0*sizeof(float)*hidden_dim)); struct ggml_tensor * f_t = encodec_sigmoid(ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 1*sizeof(float)*hidden_dim)); - struct ggml_tensor * g_t = ggml_tanh (ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 2*sizeof(float)*hidden_dim)); + struct ggml_tensor * g_t = ggml_tanh (ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 2*sizeof(float)*hidden_dim)); struct ggml_tensor * o_t = encodec_sigmoid(ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 3*sizeof(float)*hidden_dim)); c_t = ggml_add(ctx0, ggml_mul(ctx0, f_t, c_t), ggml_mul(ctx0, i_t, g_t)); @@ -207,7 +211,7 @@ static struct ggml_tensor * strided_conv_transpose_1d( return unpadded; } -bool encodec_model_load(const std::string& fname, encodec_model& model) { +bool encodec_load_model_weights(const std::string& fname, encodec_model& model) { fprintf(stderr, "%s: loading model from '%s'\n", __func__, fname.c_str()); auto infile = std::ifstream(fname, std::ios::binary); @@ -459,15 +463,9 @@ bool encodec_model_load(const std::string& fname, encodec_model& model) { model.quantizer.blocks.resize(n_q); for (int i = 0; i < n_q; i++) { - model.quantizer.blocks[i].inited = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); - model.quantizer.blocks[i].cluster_size = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_bins); model.quantizer.blocks[i].embed = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden_dim, n_bins); - model.quantizer.blocks[i].embed_avg = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden_dim, n_bins); - model.tensors["quantizer.vq.layers." + std::to_string(i) + "._codebook.inited"] = model.quantizer.blocks[i].inited; - model.tensors["quantizer.vq.layers." + std::to_string(i) + "._codebook.cluster_size"] = model.quantizer.blocks[i].cluster_size; model.tensors["quantizer.vq.layers." + std::to_string(i) + "._codebook.embed"] = model.quantizer.blocks[i].embed; - model.tensors["quantizer.vq.layers." + std::to_string(i) + "._codebook.embed_avg"] = model.quantizer.blocks[i].embed_avg; } } @@ -529,7 +527,7 @@ bool encodec_model_load(const std::string& fname, encodec_model& model) { infile.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); - // printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); + printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); total_size += ggml_nbytes(tensor); model.n_loaded++; @@ -548,7 +546,7 @@ static struct ggml_cgraph * encodec_build_graph( const std::vector & inp_audio) { const int32_t audio_length = inp_audio.size(); - const auto & model = ectx.model; + const auto & model = *ectx.model; struct ggml_init_params ggml_params = { /*.mem_size =*/ ectx.buf_compute.size(), @@ -617,11 +615,13 @@ static struct ggml_cgraph * encodec_build_graph( // first lstm layer struct ggml_tensor * hs1 = forward_pass_lstm_unilayer( - ctx0, cur, lstm.l0_ih_w, lstm.l0_hh_w, lstm.l0_ih_b, lstm.l0_hh_b); + ctx0, cur, lstm.l0_ih_w, lstm.l0_hh_w, lstm.l0_ih_b, lstm.l0_hh_b, + ggml_allocr_is_measure(ectx.allocr)); // second lstm layer struct ggml_tensor * out = forward_pass_lstm_unilayer( - ctx0, hs1, lstm.l1_ih_w, lstm.l1_hh_w, lstm.l1_ih_b, lstm.l1_hh_b); + ctx0, hs1, lstm.l1_ih_w, lstm.l1_hh_w, lstm.l1_ih_b, lstm.l1_hh_b, + ggml_allocr_is_measure(ectx.allocr)); inpL = ggml_add(ctx0, inpL, out); } @@ -723,7 +723,8 @@ static struct ggml_cgraph * encodec_build_graph( const int stride = hparams.stride; struct ggml_tensor * inpL = strided_conv_1d( - ctx0, quantized_out, model.decoder.init_conv_w, model.decoder.init_conv_b, stride); + ctx0, quantized_out, model.decoder.init_conv_w, + model.decoder.init_conv_b, stride); // lstm { @@ -733,11 +734,13 @@ static struct ggml_cgraph * encodec_build_graph( // first lstm layer struct ggml_tensor * hs1 = forward_pass_lstm_unilayer( - ctx0, cur, lstm.l0_ih_w, lstm.l0_hh_w, lstm.l0_ih_b, lstm.l0_hh_b); + ctx0, cur, lstm.l0_ih_w, lstm.l0_hh_w, lstm.l0_ih_b, lstm.l0_hh_b, + ggml_allocr_is_measure(ectx.allocr)); // second lstm layer struct ggml_tensor * out = forward_pass_lstm_unilayer( - ctx0, hs1, lstm.l1_ih_w, lstm.l1_hh_w, lstm.l1_ih_b, lstm.l1_hh_b); + ctx0, hs1, lstm.l1_ih_w, lstm.l1_hh_w, lstm.l1_ih_b, lstm.l1_hh_b, + ggml_allocr_is_measure(ectx.allocr)); inpL = ggml_add(ctx0, inpL, out); } @@ -794,7 +797,7 @@ static struct ggml_cgraph * encodec_build_graph( return gf; } -bool encodec_model_eval( +bool encodec_reconstruct_audio( encodec_context & ectx, std::vector & raw_audio, int n_threads) { @@ -855,18 +858,20 @@ bool encodec_model_eval( return true; } -struct encodec_context encodec_new_context_with_model(encodec_model & model) { - encodec_context ctx = encodec_context(model); - return ctx; -} +std::shared_ptr encodec_load_model(const std::string & model_path) { + int64_t t_start_load_us = ggml_time_us(); + + encodec_context ectx; -struct encodec_model encodec_load_model_from_file(std::string fname) { - encodec_model model; - if (!encodec_model_load(fname, model)) { - fprintf(stderr, "%s: failed to load model\n", __func__); - exit(0); + ectx.model = std::make_unique(); + if (!encodec_load_model_weights(model_path, *ectx.model)) { + fprintf(stderr, "%s: failed to load model weights from '%s'\n", __func__, model_path.c_str()); + return {}; } - return model; + + ectx.t_load_us = ggml_time_us() - t_start_load_us; + + return std::make_unique(std::move(ectx)); } void encodec_free(encodec_context & ectx) { diff --git a/encodec.h b/encodec.h index 1b6df08..8270192 100644 --- a/encodec.h +++ b/encodec.h @@ -79,10 +79,7 @@ struct encodec_encoder { }; struct encodec_quant_block { - struct ggml_tensor * inited; - struct ggml_tensor * cluster_size; struct ggml_tensor * embed; - struct ggml_tensor * embed_avg; }; struct encodec_quantizer { @@ -134,16 +131,7 @@ struct encodec_model { }; struct encodec_context { - encodec_context(encodec_model & model) : model(model) {} - - ~encodec_context() { - if (model_owner) { - delete &model; - } - } - - encodec_model & model; - bool model_owner = false; + std::unique_ptr model; struct ggml_context * ctx_audio; struct ggml_tensor * reconstructed_audio; @@ -158,15 +146,13 @@ struct encodec_context { struct ggml_allocr * allocr = {}; // statistics + int64_t t_load_us = 0; int64_t t_compute_ms = 0; }; +std::shared_ptr encodec_load_model(const std::string & model_path); -struct encodec_model encodec_load_model_from_file(std::string fname); - -struct encodec_context encodec_new_context_with_model(encodec_model & model); - -bool encodec_model_eval( +bool encodec_reconstruct_audio( encodec_context & ectx, std::vector & raw_audio, int n_threads); diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt index dac0a6f..0863048 100644 --- a/examples/main/CMakeLists.txt +++ b/examples/main/CMakeLists.txt @@ -4,7 +4,7 @@ add_executable(${TARGET} main.cpp dr_wav.h) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE encodec.cpp ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_14) if(MSVC) target_compile_definitions(${TARGET} PRIVATE -D_CRT_SECURE_NO_WARNINGS=1) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 3558322..82cc792 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -14,10 +15,10 @@ struct encodec_params { int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); // weights location - std::string model_path = "./ggml_weights"; + std::string model_path = "/Users/pbannier/Documents/encodec.cpp/ggml_weights/ggml-model.bin"; // input location - std::string original_audio_path = "./input.wav"; + std::string original_audio_path = "/Users/pbannier/Documents/encodec/test_24k.wav"; // output location std::string dest_wav_path = "output.wav"; @@ -76,7 +77,7 @@ bool read_wav_from_disk(std::string in_path, std::vector& audio_arr) { return false; } - fprintf(stderr, "Number of frames read = %lld.\n", total_frame_count); + fprintf(stderr, "%s: Number of frames read = %lld.\n", __func__, total_frame_count); audio_arr.resize(total_frame_count); memcpy(audio_arr.data(), raw_audio, total_frame_count * sizeof(float)); @@ -102,13 +103,6 @@ void write_wav_on_disk(std::vector& audio_arr, std::string dest_path) { fprintf(stderr, "%s: Number of frames written = %lld.\n", __func__, frames); } -struct encodec_context encodec_init_from_params(encodec_params & params) { - encodec_model model = encodec_load_model_from_file(params.model_path); - encodec_context ectx = encodec_new_context_with_model(model); - - return ectx; -} - int main(int argc, char **argv) { ggml_time_init(); const int64_t t_main_start_us = ggml_time_us(); @@ -120,15 +114,12 @@ int main(int argc, char **argv) { return 1; } - int64_t t_load_us = 0; - int64_t t_eval_us = 0; - // initialize encodec context - const int64_t t_start_us = ggml_time_us(); - encodec_context ectx = encodec_init_from_params(params); - t_load_us = ggml_time_us() - t_start_us; - - printf("\n"); + std::shared_ptr ectx = encodec_load_model(params.model_path); + if (!ectx) { + printf("%s: error during loading model\n", __func__); + return 1; + } // read audio from disk std::vector original_audio_arr; @@ -138,16 +129,14 @@ int main(int argc, char **argv) { } // reconstruct audio - const int64_t t_eval_us_start = ggml_time_us(); - if (!encodec_model_eval(ectx, original_audio_arr, params.n_threads)) { + if (!encodec_reconstruct_audio(*ectx, original_audio_arr, params.n_threads)) { printf("%s: error during inference\n", __func__); return 1; } - t_eval_us = ggml_time_us() - t_eval_us_start; // write reconstructed audio on disk - std::vector audio_arr(ectx.reconstructed_audio->ne[0]); - memcpy(ectx.reconstructed_audio->data, audio_arr.data(), audio_arr.size() * sizeof(float)); + std::vector audio_arr(ectx->reconstructed_audio->ne[0]); + memcpy(ectx->reconstructed_audio->data, audio_arr.data(), audio_arr.size() * sizeof(float)); write_wav_on_disk(audio_arr, params.dest_wav_path); // report timing @@ -155,12 +144,12 @@ int main(int argc, char **argv) { const int64_t t_main_end_us = ggml_time_us(); printf("\n\n"); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - printf("%s: eval time = %8.2f ms\n", __func__, t_eval_us/1000.0f); + printf("%s: load time = %8.2f ms\n", __func__, ectx->t_load_us/1000.0f); + printf("%s: eval time = %8.2f ms\n", __func__, ectx->t_compute_ms/1000.0f); printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); } - encodec_free(ectx); + encodec_free(*ectx); return 0; } \ No newline at end of file