fix: encodec forward pass (#9)

PABannier · Oct 5, 2023 · e59b55d · e59b55d
1 parent 8f5c964
commit e59b55d
Show file tree

Hide file tree

Showing 6 changed files with 80 additions and 83 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -37,4 +37,4 @@ endif()
 
 target_link_libraries(${ENCODEC_LIB} PUBLIC ggml)
 target_include_directories(${ENCODEC_LIB} PUBLIC .)
-target_compile_features(${ENCODEC_LIB} PUBLIC cxx_std_11)
+target_compile_features(${ENCODEC_LIB} PUBLIC cxx_std_14)
diff --git a/convert.py b/convert.py
@@ -11,14 +11,23 @@
     - Name                    (char[name_length])
     - Data                    (float[n_dims])
 
-NOTE
+Note
 ----
 Encodec uses weight normalization for its convolutional layers. All the weights are
 decomposed into two tensors named with the suffixes _weight_v and _weight_g. A simple
 call to the hook torch._weight_norm recovers the final weight tensor of the
 convolution from weight_v and weight_g. To drastically reduce the number of operations
 at inference time, the ggml weights file only contains the final convolution weights and
 does not store the decomposition into weight_v and weight_g.
+
+Usage
+-----
+
+```bash
+    python convert.py \
+        --dir-model ./ggml_weights/ \
+        --out-dir ./ggml_weights/
+```
 """
 import argparse
 from pathlib import Path
@@ -32,14 +41,23 @@
 parser.add_argument("--out-dir", type=str, required=True)
 
 
-def parse_model(checkpoint, outfile):
+def parse_codec_model(checkpoint, out_dir):
+    """Load encodec model checkpoint."""
+    outfile = open(out_dir, "wb")
+    outfile.write(struct.pack("i", 0x67676d6c))  # ggml magic
+
     for name in checkpoint.keys():
         if "weight_g" in name:
             # the tensor has already been parsed with the corresponding "weight_v"
             # tensor to form the final weights tensor of the convolution, therefore
             # we skip it
             continue
 
+        if "inited" in name or "cluster_size" in name or "embed_avg" in name:
+            # "inited", "cluster_size" and "embed_avg" tensors in quantizer are not used
+            # for the forward pass
+            continue
+
         var_data = checkpoint[name]
 
         if not "weight_v" in name:
@@ -49,7 +67,7 @@ def parse_model(checkpoint, outfile):
             # weight_v has its corresponding magnitude tensor to rescale the weights
             # of the convolutional layers. We parse both kinds of weights jointly to
             # build the final weight tensor of the convolution.
-            base_name = name.split(".")[:-1] 
+            base_name = name.split(".")[:-1]
             weight_g_name = ".".join(base_name + ["weight_g"])
             var_data_g = checkpoint[weight_g_name]
 
@@ -75,6 +93,8 @@ def parse_model(checkpoint, outfile):
 
         var_data.tofile(outfile)
 
+    outfile.close()
+
 
 if __name__ == "__main__":
     args = parser.parse_args()
@@ -84,12 +104,9 @@ def parse_model(checkpoint, outfile):
     out_dir = Path(args.out_dir)
     out_dir.mkdir(exist_ok=True, parents=True)
 
-    outfile = open(out_dir / "ggml-model.bin", "wb")
-    outfile.write(struct.pack("i", 0x67676d6c))  # ggml magic
+    outfile = Path(out_dir / "ggml-model.bin")
 
     checkpoint = torch.load(dir_model / "encodec_24khz-d7cc33bc.th", map_location="cpu")
-    parse_model(checkpoint, outfile)
-
-    outfile.close()
+    parse_codec_model(checkpoint, outfile)
 
     print("Done.")
diff --git a/encodec.cpp b/encodec.cpp
@@ -4,6 +4,7 @@
 #include <stdexcept>
 #include <fstream>
 #include <map>
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -139,7 +140,8 @@ static struct ggml_tensor * forward_pass_lstm_unilayer(
              struct ggml_tensor * weight_ih,
              struct ggml_tensor * weight_hh,
              struct ggml_tensor * bias_ih,
-             struct ggml_tensor * bias_hh) {
+             struct ggml_tensor * bias_hh,
+                           bool   is_measure) {
 
     const int input_dim  = inp->ne[1];
     const int hidden_dim = weight_ih->ne[1]/4;
@@ -150,8 +152,10 @@ static struct ggml_tensor * forward_pass_lstm_unilayer(
     struct ggml_tensor * c_t = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, hidden_dim);
     struct ggml_tensor * h_t = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, hidden_dim);
 
-    h_t = ggml_set_zero(h_t);
-    c_t = ggml_set_zero(c_t);
+    if (is_measure) {
+        h_t = ggml_set_zero(h_t);
+        c_t = ggml_set_zero(c_t);
+    }
 
     struct ggml_tensor * current = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
 
@@ -168,7 +172,7 @@ static struct ggml_tensor * forward_pass_lstm_unilayer(
 
         struct ggml_tensor * i_t = encodec_sigmoid(ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 0*sizeof(float)*hidden_dim));
         struct ggml_tensor * f_t = encodec_sigmoid(ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 1*sizeof(float)*hidden_dim));
-        struct ggml_tensor * g_t = ggml_tanh   (ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 2*sizeof(float)*hidden_dim));
+        struct ggml_tensor * g_t = ggml_tanh      (ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 2*sizeof(float)*hidden_dim));
         struct ggml_tensor * o_t = encodec_sigmoid(ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 3*sizeof(float)*hidden_dim));
 
         c_t = ggml_add(ctx0, ggml_mul(ctx0, f_t, c_t), ggml_mul(ctx0, i_t, g_t));
@@ -207,7 +211,7 @@ static struct ggml_tensor * strided_conv_transpose_1d(
     return unpadded;
 }
 
-bool encodec_model_load(const std::string& fname, encodec_model& model) {
+bool encodec_load_model_weights(const std::string& fname, encodec_model& model) {
     fprintf(stderr, "%s: loading model from '%s'\n", __func__, fname.c_str());
 
     auto infile = std::ifstream(fname, std::ios::binary);
@@ -459,15 +463,9 @@ bool encodec_model_load(const std::string& fname, encodec_model& model) {
             model.quantizer.blocks.resize(n_q);
 
             for (int i = 0; i < n_q; i++) {
-                model.quantizer.blocks[i].inited       = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
-                model.quantizer.blocks[i].cluster_size = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_bins);
                 model.quantizer.blocks[i].embed        = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden_dim, n_bins);
-                model.quantizer.blocks[i].embed_avg    = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden_dim, n_bins);
 
-                model.tensors["quantizer.vq.layers." + std::to_string(i) + "._codebook.inited"]       = model.quantizer.blocks[i].inited;
-                model.tensors["quantizer.vq.layers." + std::to_string(i) + "._codebook.cluster_size"] = model.quantizer.blocks[i].cluster_size;
                 model.tensors["quantizer.vq.layers." + std::to_string(i) + "._codebook.embed"]        = model.quantizer.blocks[i].embed;
-                model.tensors["quantizer.vq.layers." + std::to_string(i) + "._codebook.embed_avg"]    = model.quantizer.blocks[i].embed_avg;
             }
         }
 
@@ -529,7 +527,7 @@ bool encodec_model_load(const std::string& fname, encodec_model& model) {
 
             infile.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
 
-            // printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
+            printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
 
             total_size += ggml_nbytes(tensor);
             model.n_loaded++;
@@ -548,7 +546,7 @@ static struct ggml_cgraph * encodec_build_graph(
             const std::vector<float> & inp_audio) {
     const int32_t audio_length = inp_audio.size();
 
-    const auto & model = ectx.model;
+    const auto & model = *ectx.model;
 
     struct ggml_init_params ggml_params = {
         /*.mem_size   =*/ ectx.buf_compute.size(),
@@ -617,11 +615,13 @@ static struct ggml_cgraph * encodec_build_graph(
 
             // first lstm layer
             struct ggml_tensor * hs1 = forward_pass_lstm_unilayer(
-                ctx0, cur, lstm.l0_ih_w, lstm.l0_hh_w, lstm.l0_ih_b, lstm.l0_hh_b);
+                ctx0, cur, lstm.l0_ih_w, lstm.l0_hh_w, lstm.l0_ih_b, lstm.l0_hh_b,
+                ggml_allocr_is_measure(ectx.allocr));
 
             // second lstm layer
             struct ggml_tensor * out = forward_pass_lstm_unilayer(
-                ctx0, hs1, lstm.l1_ih_w, lstm.l1_hh_w, lstm.l1_ih_b, lstm.l1_hh_b);
+                ctx0, hs1, lstm.l1_ih_w, lstm.l1_hh_w, lstm.l1_ih_b, lstm.l1_hh_b,
+                ggml_allocr_is_measure(ectx.allocr));
 
             inpL = ggml_add(ctx0, inpL, out);
         }
@@ -723,7 +723,8 @@ static struct ggml_cgraph * encodec_build_graph(
         const int stride        = hparams.stride;
 
         struct ggml_tensor * inpL = strided_conv_1d(
-            ctx0, quantized_out, model.decoder.init_conv_w, model.decoder.init_conv_b, stride);
+            ctx0, quantized_out, model.decoder.init_conv_w,
+            model.decoder.init_conv_b, stride);
 
         // lstm
         {
@@ -733,11 +734,13 @@ static struct ggml_cgraph * encodec_build_graph(
 
             // first lstm layer
             struct ggml_tensor * hs1 = forward_pass_lstm_unilayer(
-                ctx0, cur, lstm.l0_ih_w, lstm.l0_hh_w, lstm.l0_ih_b, lstm.l0_hh_b);
+                ctx0, cur, lstm.l0_ih_w, lstm.l0_hh_w, lstm.l0_ih_b, lstm.l0_hh_b,
+                ggml_allocr_is_measure(ectx.allocr));
 
             // second lstm layer
             struct ggml_tensor * out = forward_pass_lstm_unilayer(
-                ctx0, hs1, lstm.l1_ih_w, lstm.l1_hh_w, lstm.l1_ih_b, lstm.l1_hh_b);
+                ctx0, hs1, lstm.l1_ih_w, lstm.l1_hh_w, lstm.l1_ih_b, lstm.l1_hh_b,
+                ggml_allocr_is_measure(ectx.allocr));
 
             inpL = ggml_add(ctx0, inpL, out);
         }
@@ -794,7 +797,7 @@ static struct ggml_cgraph * encodec_build_graph(
     return gf;
 }
 
-bool encodec_model_eval(
+bool encodec_reconstruct_audio(
                    encodec_context & ectx,
                 std::vector<float> & raw_audio,
                                int   n_threads) {
@@ -855,18 +858,20 @@ bool encodec_model_eval(
     return true;
 }
 
-struct encodec_context encodec_new_context_with_model(encodec_model & model) {
-    encodec_context ctx = encodec_context(model);
-    return ctx;
-}
+std::shared_ptr<encodec_context> encodec_load_model(const std::string & model_path) {
+    int64_t t_start_load_us = ggml_time_us();
+
+    encodec_context ectx;
 
-struct encodec_model encodec_load_model_from_file(std::string fname) {
-    encodec_model model;
-    if (!encodec_model_load(fname, model)) {
-        fprintf(stderr, "%s: failed to load model\n", __func__);
-        exit(0);
+    ectx.model = std::make_unique<encodec_model>();
+    if (!encodec_load_model_weights(model_path, *ectx.model)) {
+        fprintf(stderr, "%s: failed to load model weights from '%s'\n", __func__, model_path.c_str());
+        return {};
     }
-    return model;
+
+    ectx.t_load_us = ggml_time_us() - t_start_load_us;
+
+    return std::make_unique<encodec_context>(std::move(ectx));
 }
 
 void encodec_free(encodec_context & ectx) {

diff --git a/encodec.h b/encodec.h
@@ -79,10 +79,7 @@ struct encodec_encoder {
 };
 
 struct encodec_quant_block {
-    struct ggml_tensor * inited;
-    struct ggml_tensor * cluster_size;
     struct ggml_tensor * embed;
-    struct ggml_tensor * embed_avg;
 };
 
 struct encodec_quantizer {
@@ -134,16 +131,7 @@ struct encodec_model {
 };
 
 struct encodec_context {
-    encodec_context(encodec_model & model) : model(model) {}
-
-    ~encodec_context() {
-        if (model_owner) {
-            delete &model;
-        }
-    }
-
-    encodec_model & model;
-    bool model_owner = false;
+    std::unique_ptr<encodec_model> model;
 
     struct ggml_context * ctx_audio;
     struct ggml_tensor  * reconstructed_audio;
@@ -158,15 +146,13 @@ struct encodec_context {
     struct ggml_allocr * allocr = {};
 
     // statistics
+    int64_t t_load_us    = 0;
     int64_t t_compute_ms = 0;
 };
 
+std::shared_ptr<encodec_context> encodec_load_model(const std::string & model_path);
 
-struct encodec_model encodec_load_model_from_file(std::string fname);
-
-struct encodec_context encodec_new_context_with_model(encodec_model & model);
-
-bool encodec_model_eval(
+bool encodec_reconstruct_audio(
                    encodec_context & ectx,
                 std::vector<float> & raw_audio,
                                int   n_threads);

diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt
@@ -4,7 +4,7 @@ add_executable(${TARGET} main.cpp dr_wav.h)
 
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE encodec.cpp ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_14)
 
 if(MSVC)
     target_compile_definitions(${TARGET} PRIVATE -D_CRT_SECURE_NO_WARNINGS=1)