Skip to content

Commit

Permalink
fix: encodec forward pass (#9)
Browse files Browse the repository at this point in the history
  • Loading branch information
PABannier authored Oct 5, 2023
1 parent 8f5c964 commit e59b55d
Show file tree
Hide file tree
Showing 6 changed files with 80 additions and 83 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,4 @@ endif()

target_link_libraries(${ENCODEC_LIB} PUBLIC ggml)
target_include_directories(${ENCODEC_LIB} PUBLIC .)
target_compile_features(${ENCODEC_LIB} PUBLIC cxx_std_11)
target_compile_features(${ENCODEC_LIB} PUBLIC cxx_std_14)
33 changes: 25 additions & 8 deletions convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,23 @@
- Name (char[name_length])
- Data (float[n_dims])
NOTE
Note
----
Encodec uses weight normalization for its convolutional layers. All the weights are
decomposed into two tensors, named with the suffixes _weight_v and _weight_g. A simple
call to the hook torch._weight_norm yields the final weight tensor of the
convolution from weight_v and weight_g. To drastically reduce the number of operations
at inference time, the ggml weights file only contains the final convolution weights and
does not store the decomposition into weight_v and weight_g.
Usage
-----
```bash
python convert.py \
--dir-model ./ggml_weights/ \
--out-dir ./ggml_weights/
```
"""
import argparse
from pathlib import Path
Expand All @@ -32,14 +41,23 @@
parser.add_argument("--out-dir", type=str, required=True)


def parse_model(checkpoint, outfile):
def parse_codec_model(checkpoint, out_dir):
"""Load encodec model checkpoint."""
outfile = open(out_dir, "wb")
outfile.write(struct.pack("i", 0x67676d6c)) # ggml magic

for name in checkpoint.keys():
if "weight_g" in name:
# this tensor is consumed together with its corresponding "weight_v"
# tensor to form the final weights tensor of the convolution, therefore
# we skip it here
continue

if "inited" in name or "cluster_size" in name or "embed_avg" in name:
# "inited", "cluster_size" and "embed_avg" tensors in quantizer are not used
# for the forward pass
continue

var_data = checkpoint[name]

if not "weight_v" in name:
Expand All @@ -49,7 +67,7 @@ def parse_model(checkpoint, outfile):
# weight_v has its corresponding magnitude tensor to rescale the weights
# of the convolutional layers. We parse both kinds of weights jointly to
# build the final weight tensor of the convolution.
base_name = name.split(".")[:-1]
base_name = name.split(".")[:-1]
weight_g_name = ".".join(base_name + ["weight_g"])
var_data_g = checkpoint[weight_g_name]

Expand All @@ -75,6 +93,8 @@ def parse_model(checkpoint, outfile):

var_data.tofile(outfile)

outfile.close()


if __name__ == "__main__":
args = parser.parse_args()
Expand All @@ -84,12 +104,9 @@ def parse_model(checkpoint, outfile):
out_dir = Path(args.out_dir)
out_dir.mkdir(exist_ok=True, parents=True)

outfile = open(out_dir / "ggml-model.bin", "wb")
outfile.write(struct.pack("i", 0x67676d6c)) # ggml magic
outfile = Path(out_dir / "ggml-model.bin")

checkpoint = torch.load(dir_model / "encodec_24khz-d7cc33bc.th", map_location="cpu")
parse_model(checkpoint, outfile)

outfile.close()
parse_codec_model(checkpoint, outfile)

print("Done.")
63 changes: 34 additions & 29 deletions encodec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <stdexcept>
#include <fstream>
#include <map>
#include <memory>
#include <string>
#include <vector>

Expand Down Expand Up @@ -139,7 +140,8 @@ static struct ggml_tensor * forward_pass_lstm_unilayer(
struct ggml_tensor * weight_ih,
struct ggml_tensor * weight_hh,
struct ggml_tensor * bias_ih,
struct ggml_tensor * bias_hh) {
struct ggml_tensor * bias_hh,
bool is_measure) {

const int input_dim = inp->ne[1];
const int hidden_dim = weight_ih->ne[1]/4;
Expand All @@ -150,8 +152,10 @@ static struct ggml_tensor * forward_pass_lstm_unilayer(
struct ggml_tensor * c_t = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, hidden_dim);
struct ggml_tensor * h_t = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, hidden_dim);

h_t = ggml_set_zero(h_t);
c_t = ggml_set_zero(c_t);
if (is_measure) {
h_t = ggml_set_zero(h_t);
c_t = ggml_set_zero(c_t);
}

struct ggml_tensor * current = ggml_cont(ctx0, ggml_transpose(ctx0, inp));

Expand All @@ -168,7 +172,7 @@ static struct ggml_tensor * forward_pass_lstm_unilayer(

struct ggml_tensor * i_t = encodec_sigmoid(ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 0*sizeof(float)*hidden_dim));
struct ggml_tensor * f_t = encodec_sigmoid(ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 1*sizeof(float)*hidden_dim));
struct ggml_tensor * g_t = ggml_tanh (ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 2*sizeof(float)*hidden_dim));
struct ggml_tensor * g_t = ggml_tanh (ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 2*sizeof(float)*hidden_dim));
struct ggml_tensor * o_t = encodec_sigmoid(ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 3*sizeof(float)*hidden_dim));

c_t = ggml_add(ctx0, ggml_mul(ctx0, f_t, c_t), ggml_mul(ctx0, i_t, g_t));
Expand Down Expand Up @@ -207,7 +211,7 @@ static struct ggml_tensor * strided_conv_transpose_1d(
return unpadded;
}

bool encodec_model_load(const std::string& fname, encodec_model& model) {
bool encodec_load_model_weights(const std::string& fname, encodec_model& model) {
fprintf(stderr, "%s: loading model from '%s'\n", __func__, fname.c_str());

auto infile = std::ifstream(fname, std::ios::binary);
Expand Down Expand Up @@ -459,15 +463,9 @@ bool encodec_model_load(const std::string& fname, encodec_model& model) {
model.quantizer.blocks.resize(n_q);

for (int i = 0; i < n_q; i++) {
model.quantizer.blocks[i].inited = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
model.quantizer.blocks[i].cluster_size = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_bins);
model.quantizer.blocks[i].embed = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden_dim, n_bins);
model.quantizer.blocks[i].embed_avg = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden_dim, n_bins);

model.tensors["quantizer.vq.layers." + std::to_string(i) + "._codebook.inited"] = model.quantizer.blocks[i].inited;
model.tensors["quantizer.vq.layers." + std::to_string(i) + "._codebook.cluster_size"] = model.quantizer.blocks[i].cluster_size;
model.tensors["quantizer.vq.layers." + std::to_string(i) + "._codebook.embed"] = model.quantizer.blocks[i].embed;
model.tensors["quantizer.vq.layers." + std::to_string(i) + "._codebook.embed_avg"] = model.quantizer.blocks[i].embed_avg;
}
}

Expand Down Expand Up @@ -529,7 +527,7 @@ bool encodec_model_load(const std::string& fname, encodec_model& model) {

infile.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));

// printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);

total_size += ggml_nbytes(tensor);
model.n_loaded++;
Expand All @@ -548,7 +546,7 @@ static struct ggml_cgraph * encodec_build_graph(
const std::vector<float> & inp_audio) {
const int32_t audio_length = inp_audio.size();

const auto & model = ectx.model;
const auto & model = *ectx.model;

struct ggml_init_params ggml_params = {
/*.mem_size =*/ ectx.buf_compute.size(),
Expand Down Expand Up @@ -617,11 +615,13 @@ static struct ggml_cgraph * encodec_build_graph(

// first lstm layer
struct ggml_tensor * hs1 = forward_pass_lstm_unilayer(
ctx0, cur, lstm.l0_ih_w, lstm.l0_hh_w, lstm.l0_ih_b, lstm.l0_hh_b);
ctx0, cur, lstm.l0_ih_w, lstm.l0_hh_w, lstm.l0_ih_b, lstm.l0_hh_b,
ggml_allocr_is_measure(ectx.allocr));

// second lstm layer
struct ggml_tensor * out = forward_pass_lstm_unilayer(
ctx0, hs1, lstm.l1_ih_w, lstm.l1_hh_w, lstm.l1_ih_b, lstm.l1_hh_b);
ctx0, hs1, lstm.l1_ih_w, lstm.l1_hh_w, lstm.l1_ih_b, lstm.l1_hh_b,
ggml_allocr_is_measure(ectx.allocr));

inpL = ggml_add(ctx0, inpL, out);
}
Expand Down Expand Up @@ -723,7 +723,8 @@ static struct ggml_cgraph * encodec_build_graph(
const int stride = hparams.stride;

struct ggml_tensor * inpL = strided_conv_1d(
ctx0, quantized_out, model.decoder.init_conv_w, model.decoder.init_conv_b, stride);
ctx0, quantized_out, model.decoder.init_conv_w,
model.decoder.init_conv_b, stride);

// lstm
{
Expand All @@ -733,11 +734,13 @@ static struct ggml_cgraph * encodec_build_graph(

// first lstm layer
struct ggml_tensor * hs1 = forward_pass_lstm_unilayer(
ctx0, cur, lstm.l0_ih_w, lstm.l0_hh_w, lstm.l0_ih_b, lstm.l0_hh_b);
ctx0, cur, lstm.l0_ih_w, lstm.l0_hh_w, lstm.l0_ih_b, lstm.l0_hh_b,
ggml_allocr_is_measure(ectx.allocr));

// second lstm layer
struct ggml_tensor * out = forward_pass_lstm_unilayer(
ctx0, hs1, lstm.l1_ih_w, lstm.l1_hh_w, lstm.l1_ih_b, lstm.l1_hh_b);
ctx0, hs1, lstm.l1_ih_w, lstm.l1_hh_w, lstm.l1_ih_b, lstm.l1_hh_b,
ggml_allocr_is_measure(ectx.allocr));

inpL = ggml_add(ctx0, inpL, out);
}
Expand Down Expand Up @@ -794,7 +797,7 @@ static struct ggml_cgraph * encodec_build_graph(
return gf;
}

bool encodec_model_eval(
bool encodec_reconstruct_audio(
encodec_context & ectx,
std::vector<float> & raw_audio,
int n_threads) {
Expand Down Expand Up @@ -855,18 +858,20 @@ bool encodec_model_eval(
return true;
}

struct encodec_context encodec_new_context_with_model(encodec_model & model) {
encodec_context ctx = encodec_context(model);
return ctx;
}
std::shared_ptr<encodec_context> encodec_load_model(const std::string & model_path) {
int64_t t_start_load_us = ggml_time_us();

encodec_context ectx;

struct encodec_model encodec_load_model_from_file(std::string fname) {
encodec_model model;
if (!encodec_model_load(fname, model)) {
fprintf(stderr, "%s: failed to load model\n", __func__);
exit(0);
ectx.model = std::make_unique<encodec_model>();
if (!encodec_load_model_weights(model_path, *ectx.model)) {
fprintf(stderr, "%s: failed to load model weights from '%s'\n", __func__, model_path.c_str());
return {};
}
return model;

ectx.t_load_us = ggml_time_us() - t_start_load_us;

return std::make_unique<encodec_context>(std::move(ectx));
}

void encodec_free(encodec_context & ectx) {
Expand Down
22 changes: 4 additions & 18 deletions encodec.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,7 @@ struct encodec_encoder {
};

struct encodec_quant_block {
struct ggml_tensor * inited;
struct ggml_tensor * cluster_size;
struct ggml_tensor * embed;
struct ggml_tensor * embed_avg;
};

struct encodec_quantizer {
Expand Down Expand Up @@ -134,16 +131,7 @@ struct encodec_model {
};

struct encodec_context {
encodec_context(encodec_model & model) : model(model) {}

~encodec_context() {
if (model_owner) {
delete &model;
}
}

encodec_model & model;
bool model_owner = false;
std::unique_ptr<encodec_model> model;

struct ggml_context * ctx_audio;
struct ggml_tensor * reconstructed_audio;
Expand All @@ -158,15 +146,13 @@ struct encodec_context {
struct ggml_allocr * allocr = {};

// statistics
int64_t t_load_us = 0;
int64_t t_compute_ms = 0;
};

std::shared_ptr<encodec_context> encodec_load_model(const std::string & model_path);

struct encodec_model encodec_load_model_from_file(std::string fname);

struct encodec_context encodec_new_context_with_model(encodec_model & model);

bool encodec_model_eval(
bool encodec_reconstruct_audio(
encodec_context & ectx,
std::vector<float> & raw_audio,
int n_threads);
Expand Down
2 changes: 1 addition & 1 deletion examples/main/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ add_executable(${TARGET} main.cpp dr_wav.h)

install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE encodec.cpp ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_14)

if(MSVC)
target_compile_definitions(${TARGET} PRIVATE -D_CRT_SECURE_NO_WARNINGS=1)
Expand Down
Loading

0 comments on commit e59b55d

Please sign in to comment.