Fix non-monotonically increasing audio frame DTS issue (livepeer#281)

The issue was solved by forcing each audio AVPacket's DTS to be monotonic, the same way the ffmpeg [pipeline](https://github.com/livepeer/FFmpeg/blob/682c4189d8364867bcc49f9749e04b27dc37cded/fftools/ffmpeg.c#L824) does.
Video-Miner · Dec 30, 2021 · e540970 · e540970
1 parent c6b109e
commit e540970
Show file tree

Hide file tree

Showing 9 changed files with 62 additions and 0 deletions.
diff --git a/data/duplicate-audio-dts.ts b/data/duplicate-audio-dts.ts
diff --git a/data/sign_nv1.bin b/data/sign_nv1.bin
diff --git a/data/sign_nv2.bin b/data/sign_nv2.bin
diff --git a/data/sign_sw1.bin b/data/sign_sw1.bin
diff --git a/data/sign_sw2.bin b/data/sign_sw2.bin
diff --git a/ffmpeg/api_test.go b/ffmpeg/api_test.go
@@ -1362,6 +1362,41 @@ func TestTranscoder_NoKeyframe(t *testing.T) {
 	noKeyframeSegment(t, Software)
 }
 
+func nonMonotonicAudioSegment(t *testing.T, accel Acceleration) {
+	run, dir := setupTest(t)
+	defer os.RemoveAll(dir)
+
+	cmd := `
+    cp "$1"/../data/duplicate-audio-dts.ts .
+
+    # verify dts non-monotonic audio frame in duplicate-audio-dts.ts
+    ffprobe -select_streams a -show_streams -show_packets duplicate-audio-dts.ts | grep dts_time=98.127522 | wc -l | grep 2
+  `
+	run(cmd)
+
+	tc := NewTranscoder()
+	prof := P144p30fps16x9
+
+	in := &TranscodeOptionsIn{
+		Fname: fmt.Sprintf("%s/duplicate-audio-dts.ts", dir),
+		Accel: accel,
+	}
+	out := []TranscodeOptions{{
+		Oname:   fmt.Sprintf("%s/out-dts.ts", dir),
+		Profile: prof,
+		Accel:   accel,
+	}}
+	_, err := tc.Transcode(in, out)
+	if err != nil {
+		t.Error("Expected to succeed for a segment with non-monotonic audio frame but did not")
+	}
+
+	tc.StopTranscoder()
+}
+func TestTranscoder_NonMonotonicAudioSegment(t *testing.T) {
+	nonMonotonicAudioSegment(t, Software)
+}
+
 /*
 func detectionFreq(t *testing.T, accel Acceleration) {
 	run, dir := setupTest(t)

diff --git a/ffmpeg/encoder.c b/ffmpeg/encoder.c
@@ -87,6 +87,9 @@ static int add_audio_stream(struct input_ctx *ictx, struct output_ctx *octx)
 
   // signal whether to drop preroll audio
   if (st->codecpar->initial_padding) octx->drop_ts = AV_NOPTS_VALUE;
+
+  octx->last_audio_dts = AV_NOPTS_VALUE;
+
   return 0;
 
 add_audio_err:
@@ -376,6 +379,24 @@ int mux(AVPacket *pkt, AVRational tb, struct output_ctx *octx, AVStream *ost)
   if (AVMEDIA_TYPE_AUDIO == ost->codecpar->codec_type) {
       if (octx->drop_ts == AV_NOPTS_VALUE) octx->drop_ts = pkt->pts;
       if (pkt->pts && pkt->pts == octx->drop_ts) return 0;
+
+      if (pkt->dts != AV_NOPTS_VALUE && pkt->pts != AV_NOPTS_VALUE && pkt->dts > pkt->pts) {
+        pkt->pts = pkt->dts = pkt->pts + pkt->dts + octx->last_audio_dts + 1
+                     - FFMIN3(pkt->pts, pkt->dts, octx->last_audio_dts + 1)
+                     - FFMAX3(pkt->pts, pkt->dts, octx->last_audio_dts + 1);
+      }
+      /*https://github.com/livepeer/FFmpeg/blob/682c4189d8364867bcc49f9749e04b27dc37cded/fftools/ffmpeg.c#L824*/
+      if (pkt->dts != AV_NOPTS_VALUE && octx->last_audio_dts != AV_NOPTS_VALUE) {
+        /* max is the lowest DTS this packet may carry: last_audio_dts itself when the output
+        format tolerates equal timestamps (AVFMT_TS_NONSTRICT), otherwise last_audio_dts + 1. */
+        int64_t max = octx->last_audio_dts + !(octx->oc->oformat->flags & AVFMT_TS_NONSTRICT);
+        // a dts below max is non-monotonic: clamp it to max (raising pts to at least max when pts >= dts)
+        if (pkt->dts < max) {
+          if (pkt->pts >= pkt->dts) pkt->pts = FFMAX(pkt->pts, max);
+          pkt->dts = max;
+        }
+      }
+      octx->last_audio_dts = pkt->dts;
   }
 
   return av_interleaved_write_frame(octx->oc, pkt);

diff --git a/ffmpeg/filter.h b/ffmpeg/filter.h
@@ -59,6 +59,8 @@ struct output_ctx {
 
   int64_t drop_ts;     // preroll audio ts to drop
 
+  int64_t last_audio_dts;     //dts of the last audio packet sent to the muxer
+
   int64_t gop_time, gop_pts_len, next_kf_pts; // for gop reset
 
   int64_t clip_from, clip_to, clip_from_pts, clip_to_pts, clip_started, clip_start_pts, clip_start_pts_found; // for clipping

diff --git a/ffmpeg/nvidia_test.go b/ffmpeg/nvidia_test.go
@@ -796,6 +796,10 @@ func TestNvidia_NoKeyframe(t *testing.T) {
 	noKeyframeSegment(t, Nvidia)
 }
 
+func TestNvidia_NonMonotonicAudioSegment(t *testing.T) {
+	nonMonotonicAudioSegment(t, Nvidia)
+}
+
 /*
 func TestNvidia_DetectionFreq(t *testing.T) {
 	detectionFreq(t, Nvidia)