From af02667f5eadbbba925a5dbd4a6ba6233435c164 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=C5=9Aled=C5=BA?= Date: Fri, 2 Aug 2024 19:23:13 +0200 Subject: [PATCH] Bring back support for older ffmpegs --- Makefile | 2 +- c_src/xav/channel_layout.h | 12 ++++++++++ c_src/xav/converter.c | 27 +++++++++++++++------ c_src/xav/converter.h | 10 ++++---- c_src/xav/decoder.c | 48 ++++++++++++++++++++++++++------------ c_src/xav/decoder.h | 2 +- c_src/xav/reader.c | 20 ++++++++++++++-- c_src/xav/utils.c | 20 ---------------- c_src/xav/utils.h | 1 - c_src/xav/xav_nif.c | 3 ++- test/decoder_test.exs | 2 +- test/reader_test.exs | 3 ++- 12 files changed, 96 insertions(+), 54 deletions(-) create mode 100644 c_src/xav/channel_layout.h diff --git a/Makefile b/Makefile index 4e2c93a..e36ad6f 100644 --- a/Makefile +++ b/Makefile @@ -8,9 +8,9 @@ PRIV_DIR = $(MIX_APP_PATH)/priv XAV_SO = $(PRIV_DIR)/libxav.so # uncomment to compile with debug logs # XAV_DEBUG_LOGS = -DXAV_DEBUG=1 -HEADERS = $(XAV_DIR)/reader.h $(XAV_DIR)/decoder.h $(XAV_DIR)/converter.h $(XAV_DIR)/utils.h +HEADERS = $(XAV_DIR)/reader.h $(XAV_DIR)/decoder.h $(XAV_DIR)/converter.h $(XAV_DIR)/channel_layout.h $(XAV_DIR)/utils.h SOURCES = $(XAV_DIR)/xav_nif.c $(XAV_DIR)/reader.c $(XAV_DIR)/decoder.c $(XAV_DIR)/converter.c $(XAV_DIR)/utils.c CFLAGS = $(XAV_DEBUG_LOGS) -fPIC -shared diff --git a/c_src/xav/channel_layout.h b/c_src/xav/channel_layout.h new file mode 100644 index 0000000..5dd8685 --- /dev/null +++ b/c_src/xav/channel_layout.h @@ -0,0 +1,12 @@ +#ifndef CHANNEL_LAYOUT_H +#define CHANNEL_LAYOUT_H +#include <libavutil/channel_layout.h> + +struct ChannelLayout { +#if LIBAVUTIL_VERSION_MAJOR >= 58 + AVChannelLayout layout; +#else + uint64_t layout; +#endif +}; +#endif diff --git a/c_src/xav/converter.c b/c_src/xav/converter.c index cecaf99..a7564aa 100644 --- a/c_src/xav/converter.c +++ b/c_src/xav/converter.c @@ -5,10 +5,11 @@ #include #include +#include "channel_layout.h" #include "utils.h" -int
converter_init(struct Converter *c, AVChannelLayout in_chlayout, int in_sample_rate, - enum AVSampleFormat in_sample_fmt, AVChannelLayout out_chlayout, +int converter_init(struct Converter *c, struct ChannelLayout in_chlayout, int in_sample_rate, + enum AVSampleFormat in_sample_fmt, struct ChannelLayout out_chlayout, int out_sample_rate, enum AVSampleFormat out_sample_fmt) { c->swr_ctx = swr_alloc(); c->in_sample_rate = in_sample_rate; @@ -16,8 +17,13 @@ int converter_init(struct Converter *c, AVChannelLayout in_chlayout, int in_samp c->out_chlayout = out_chlayout; c->out_sample_fmt = out_sample_fmt; - av_opt_set_chlayout(c->swr_ctx, "in_chlayout", &in_chlayout, 0); - av_opt_set_chlayout(c->swr_ctx, "out_chlayout", &out_chlayout, 0); +#if LIBAVUTIL_VERSION_MAJOR >= 58 + av_opt_set_chlayout(c->swr_ctx, "in_chlayout", &in_chlayout.layout, 0); + av_opt_set_chlayout(c->swr_ctx, "out_chlayout", &out_chlayout.layout, 0); +#else + av_opt_set_channel_layout(c->swr_ctx, "in_channel_layout", in_chlayout.layout, 0); + av_opt_set_channel_layout(c->swr_ctx, "out_channel_layout", out_chlayout.layout, 0); +#endif av_opt_set_int(c->swr_ctx, "in_sample_rate", in_sample_rate, 0); av_opt_set_int(c->swr_ctx, "out_sample_rate", out_sample_rate, 0); @@ -30,6 +36,13 @@ int converter_init(struct Converter *c, AVChannelLayout in_chlayout, int in_samp int converter_convert(struct Converter *c, AVFrame *src_frame, uint8_t ***out_data, int *out_samples, int *out_size) { + +#if LIBAVUTIL_VERSION_MAJOR >= 58 + int out_nb_channels = c->out_chlayout.layout.nb_channels; +#else + int out_nb_channels = av_get_channel_layout_nb_channels(c->out_chlayout.layout); +#endif + uint8_t **out_data_tmp = NULL; int max_out_nb_samples = swr_get_out_samples(c->swr_ctx, src_frame->nb_samples); int out_bytes_per_sample = av_get_bytes_per_sample(c->out_sample_fmt); @@ -38,7 +51,7 @@ int converter_convert(struct Converter *c, AVFrame *src_frame, uint8_t ***out_da // to use fast/aligned SIMD routines - this is what 
align option is used for. // See https://stackoverflow.com/questions/35678041/what-is-linesize-alignment-meaning // Because we return the binary straight to the Erlang, we can disable it. - int ret = av_samples_alloc_array_and_samples(&out_data_tmp, NULL, c->out_chlayout.nb_channels, + int ret = av_samples_alloc_array_and_samples(&out_data_tmp, NULL, out_nb_channels, max_out_nb_samples, c->out_sample_fmt, 1); if (ret < 0) { @@ -58,9 +71,9 @@ int converter_convert(struct Converter *c, AVFrame *src_frame, uint8_t ***out_da XAV_LOG_DEBUG("Converted %d samples per channel", *out_samples); - *out_size = *out_samples * out_bytes_per_sample * c->out_chlayout.nb_channels; + *out_size = *out_samples * out_bytes_per_sample * out_nb_channels; return 0; } -void converter_free(struct Converter *c) { swr_free(&c->swr_ctx); } \ No newline at end of file +void converter_free(struct Converter *c) { swr_free(&c->swr_ctx); } diff --git a/c_src/xav/converter.h b/c_src/xav/converter.h index f2d1ec2..4e99a5d 100644 --- a/c_src/xav/converter.h +++ b/c_src/xav/converter.h @@ -4,18 +4,20 @@ #include #include +#include "channel_layout.h" + struct Converter { SwrContext *swr_ctx; int64_t in_sample_rate; int64_t out_sample_rate; - AVChannelLayout out_chlayout; + struct ChannelLayout out_chlayout; enum AVSampleFormat out_sample_fmt; }; -int converter_init(struct Converter *c, AVChannelLayout in_chlayout, int in_sample_rate, - enum AVSampleFormat in_sample_fmt, AVChannelLayout out_chlaout, +int converter_init(struct Converter *c, struct ChannelLayout in_chlayout, int in_sample_rate, + enum AVSampleFormat in_sample_fmt, struct ChannelLayout out_chlayout, int out_sample_rate, enum AVSampleFormat out_sample_fmt); int converter_convert(struct Converter *c, AVFrame *src_frame, uint8_t ***out_data, int *out_samples, int *out_size); void converter_free(struct Converter *converter); -#endif \ No newline at end of file +#endif diff --git a/c_src/xav/decoder.c b/c_src/xav/decoder.c index 516c332..e466924
100644 --- a/c_src/xav/decoder.c +++ b/c_src/xav/decoder.c @@ -1,8 +1,11 @@ #include "decoder.h" #include "utils.h" +static int init_converter(struct Decoder *decoder); + int decoder_init(struct Decoder *decoder, const char *codec) { decoder->swr_ctx = NULL; + decoder->converter = NULL; decoder->out_data = NULL; if (strcmp(codec, "opus") == 0) { @@ -31,19 +34,6 @@ int decoder_init(struct Decoder *decoder, const char *codec) { return -1; } - if (decoder->media_type == AVMEDIA_TYPE_AUDIO) { - AVChannelLayout out_chlayout = decoder->c->ch_layout; - int out_sample_rate = decoder->c->sample_rate; - enum AVSampleFormat out_sample_fmt = AV_SAMPLE_FMT_FLT; - - int ret = converter_init(&decoder->converter, decoder->c->ch_layout, decoder->c->sample_rate, - decoder->c->sample_fmt, out_chlayout, out_sample_rate, out_sample_fmt); - - if (ret < 0) { - return ret; - } - } - return 0; } @@ -74,7 +64,15 @@ int decoder_decode(struct Decoder *decoder, AVPacket *pkt, AVFrame *frame) { decoder->frame_linesize = frame->linesize; } } else if (decoder->media_type == AVMEDIA_TYPE_AUDIO) { - return converter_convert(&decoder->converter, frame, &decoder->out_data, &decoder->out_samples, + + if (decoder->converter == NULL) { + ret = init_converter(decoder); + if (ret < 0) { + return ret; + } + } + + return converter_convert(decoder->converter, frame, &decoder->out_data, &decoder->out_samples, &decoder->out_size); } @@ -89,4 +87,40 @@ void decoder_free(struct Decoder *decoder) { if (decoder->c != NULL) { avcodec_free_context(&decoder->c); } -} \ No newline at end of file +} + +/* Lazily builds the audio converter for `decoder`: same channel layout and sample rate as the + * codec context, AV_SAMPLE_FMT_FLT output. Returns converter_init()'s result, or AVERROR(ENOMEM) + * if allocation fails. On failure decoder->converter stays NULL so decoder_decode() retries. */ +static int init_converter(struct Decoder *decoder) { + struct Converter *converter = calloc(1, sizeof(*converter)); + if (converter == NULL) { + return AVERROR(ENOMEM); + } + + int out_sample_rate = decoder->c->sample_rate; + enum AVSampleFormat out_sample_fmt = AV_SAMPLE_FMT_FLT; + + struct ChannelLayout in_chlayout, out_chlayout; +#if LIBAVUTIL_VERSION_MAJOR >= 58 + in_chlayout.layout = decoder->c->ch_layout; + out_chlayout.layout = decoder->c->ch_layout;
+#else + in_chlayout.layout = decoder->c->channel_layout; + out_chlayout.layout = decoder->c->channel_layout; + XAV_LOG_DEBUG("in_chlayout %llu", (unsigned long long)in_chlayout.layout); + XAV_LOG_DEBUG("in nb_channels %d", av_get_channel_layout_nb_channels(in_chlayout.layout)); +#endif + + int ret = converter_init(converter, in_chlayout, decoder->c->sample_rate, + decoder->c->sample_fmt, out_chlayout, out_sample_rate, out_sample_fmt); + if (ret < 0) { + /* Do not publish a half-initialised converter. */ + converter_free(converter); + free(converter); + return ret; + } + + decoder->converter = converter; + return ret; +} diff --git a/c_src/xav/decoder.h b/c_src/xav/decoder.h index 20a5d58..f6710e9 100644 --- a/c_src/xav/decoder.h +++ b/c_src/xav/decoder.h @@ -17,7 +17,7 @@ struct Decoder { uint8_t **frame_data; int *frame_linesize; - struct Converter converter; + struct Converter *converter; // Buffer where audio samples are written after conversion. // We always convet to packed format, so only out_data[0] is set. uint8_t **out_data; diff --git a/c_src/xav/reader.c b/c_src/xav/reader.c index 61b06e5..e5d32da 100644 --- a/c_src/xav/reader.c +++ b/c_src/xav/reader.c @@ -3,6 +3,8 @@ #include #include +#include <inttypes.h> + int reader_init(struct Reader *reader, unsigned char *path, size_t path_size, int device_flag, enum AVMediaType media_type) { int ret; @@ -71,11 +73,24 @@ int reader_init(struct Reader *reader, unsigned char *path, size_t path_size, in } if (reader->media_type == AVMEDIA_TYPE_AUDIO) { - AVChannelLayout out_chlayout = AV_CHANNEL_LAYOUT_MONO; int out_sample_rate = 16000; enum AVSampleFormat out_sample_fmt = AV_SAMPLE_FMT_FLT; - int ret = converter_init(&reader->converter, reader->c->ch_layout, reader->c->sample_rate, + struct ChannelLayout in_chlayout, out_chlayout; +#if LIBAVUTIL_VERSION_MAJOR >= 58 + XAV_LOG_DEBUG("in nb_channels %d", reader->c->ch_layout.nb_channels); + in_chlayout.layout = reader->c->ch_layout; + av_channel_layout_from_mask(&out_chlayout.layout, AV_CH_LAYOUT_MONO); +#else + in_chlayout.layout = reader->c->channel_layout; + out_chlayout.layout = AV_CH_LAYOUT_MONO; + + XAV_LOG_DEBUG("in_chlayout %" PRIu64, in_chlayout.layout); +
XAV_LOG_DEBUG("out_chlayout %" PRIu64, out_chlayout.layout); + XAV_LOG_DEBUG("in nb_channels %d", av_get_channel_layout_nb_channels(in_chlayout.layout)); +#endif + + int ret = converter_init(&reader->converter, in_chlayout, reader->c->sample_rate, reader->c->sample_fmt, out_chlayout, out_sample_rate, out_sample_fmt); if (ret < 0) { @@ -211,6 +226,7 @@ void reader_free_frame(struct Reader *reader) { if (reader->out_data != NULL) { free(reader->out_data); + reader->out_data = NULL; } } diff --git a/c_src/xav/utils.c b/c_src/xav/utils.c index 78610d5..87772cd 100644 --- a/c_src/xav/utils.c +++ b/c_src/xav/utils.c @@ -13,26 +13,6 @@ void print_supported_pix_fmts(AVCodec *codec) { } } -int init_swr_ctx_from_frame(SwrContext **swr_ctx, AVFrame *frame) { - *swr_ctx = swr_alloc(); - enum AVSampleFormat out_sample_fmt = av_get_alt_sample_fmt(frame->format, 0); - -#if LIBAVUTIL_VERSION_MAJOR >= 58 - av_opt_set_chlayout(*swr_ctx, "in_chlayout", &frame->ch_layout, 0); - av_opt_set_chlayout(*swr_ctx, "out_chlayout", &frame->ch_layout, 0); -#else - av_opt_set_channel_layout(*swr_ctx, "in_channel_layout", frame->channel_layout, 0); - av_opt_set_channel_layout(*swr_ctx, "out_channel_layout", frame->channel_layout, 0); -#endif - - av_opt_set_int(*swr_ctx, "in_sample_rate", frame->sample_rate, 0); - av_opt_set_int(*swr_ctx, "out_sample_rate", frame->sample_rate, 0); - av_opt_set_sample_fmt(*swr_ctx, "in_sample_fmt", frame->format, 0); - av_opt_set_sample_fmt(*swr_ctx, "out_sample_fmt", out_sample_fmt, 0); - - return swr_init(*swr_ctx); -} - void convert_to_rgb(AVFrame *src_frame, uint8_t *dst_data[], int dst_linesize[]) { struct SwsContext *sws_ctx = sws_getContext(src_frame->width, src_frame->height, src_frame->format, src_frame->width, diff --git a/c_src/xav/utils.h b/c_src/xav/utils.h index 7339cda..72e21dd 100644 --- a/c_src/xav/utils.h +++ b/c_src/xav/utils.h @@ -18,7 +18,6 @@ #define XAV_FREE(X) enif_free(X) void print_supported_pix_fmts(AVCodec *codec); -int init_swr_ctx_from_frame(SwrContext
**swr_ctx, AVFrame *frame); void convert_to_rgb(AVFrame *src_frame, uint8_t *dst_data[], int dst_linesize[]); ERL_NIF_TERM xav_nif_ok(ErlNifEnv *env, ERL_NIF_TERM data_term); diff --git a/c_src/xav/xav_nif.c b/c_src/xav/xav_nif.c index 7f52e96..81cc63d 100644 --- a/c_src/xav/xav_nif.c +++ b/c_src/xav/xav_nif.c @@ -190,7 +190,7 @@ ERL_NIF_TERM decode(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { decoder->frame_linesize, "rgb"); } else if (decoder->media_type == AVMEDIA_TYPE_AUDIO) { - const char *out_format = av_get_sample_fmt_name(decoder->converter.out_sample_fmt); + const char *out_format = av_get_sample_fmt_name(decoder->converter->out_sample_fmt); frame_term = xav_nif_audio_frame_to_term(env, decoder->out_data, decoder->out_samples, decoder->out_size, out_format, frame->pts); @@ -204,6 +204,7 @@ ERL_NIF_TERM decode(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { if (decoder->out_data != NULL) { free(decoder->out_data); + decoder->out_data = NULL; } return term; diff --git a/test/decoder_test.exs b/test/decoder_test.exs index f4163ad..df8fc4d 100644 --- a/test/decoder_test.exs +++ b/test/decoder_test.exs @@ -1,5 +1,5 @@ defmodule Xav.DecoderTest do - use ExUnit.Case, async: true + use ExUnit.Case, async: false @vp8_keyframe <<80, 188, 0, 157, 1, 42, 128, 2, 224, 1, 57, 107, 0, 47, 28, 34, 22, 22, 34, 102, 18, 32, 212, 14, 239, 198, 191, 249, 103, 67, 12, 209, 59, 136, 119, 231, 148, diff --git a/test/reader_test.exs b/test/reader_test.exs index 2dd441c..1a7f948 100644 --- a/test/reader_test.exs +++ b/test/reader_test.exs @@ -1,5 +1,5 @@ defmodule Xav.ReaderTest do - use ExUnit.Case, async: true + use ExUnit.Case, async: false test "new/1" do assert {:ok, %Xav.Reader{}} = Xav.Reader.new("./test/fixtures/sample_h264.mp4") @@ -41,6 +41,7 @@ defmodule Xav.ReaderTest do end end) + @tag :debug test "speech to text" do for {path, expected_output} <- [ # This file has been downloaded from https://audio-samples.github.io/