FFmpeg
dialoguenhance_template.c
Go to the documentation of this file.
1 /*
2  * This file is part of FFmpeg.
3  *
4  * FFmpeg is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU Lesser General Public
6  * License as published by the Free Software Foundation; either
7  * version 2.1 of the License, or (at your option) any later version.
8  *
9  * FFmpeg is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12  * Lesser General Public License for more details.
13  *
14  * You should have received a copy of the GNU Lesser General Public
15  * License along with FFmpeg; if not, write to the Free Software
16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17  */
18 
19 #include "libavutil/tx.h"
20 #include "avfilter.h"
21 #include "internal.h"
22 #include "audio.h"
23 
/*
 * Per-precision template configuration.
 *
 * Every macro below is first undefined and then redefined according to
 * DEPTH, presumably so that this template can be included more than once
 * with a different DEPTH each time (TODO confirm against the including file).
 */
24 #undef ctype
25 #undef ftype
26 #undef SQRT
27 #undef HYPOT
28 #undef SAMPLE_FORMAT
29 #undef TX_TYPE
30 #undef ONE
31 #undef ZERO
32 #undef HALF
33 #undef SIN
34 #undef CLIP
35 #undef EPSILON
/* DEPTH == 32: single precision types, math functions and constants. */
36 #if DEPTH == 32
37 #define SAMPLE_FORMAT float
38 #define SQRT sqrtf
39 #define HYPOT hypotf
40 #define ctype AVComplexFloat
41 #define ftype float
42 #define TX_TYPE AV_TX_FLOAT_RDFT
43 #define ONE 1.f
44 #define ZERO 0.f
45 #define HALF 0.5f
46 #define SIN sinf
47 #define CLIP av_clipf
48 #define EPSILON FLT_EPSILON
/* Any other DEPTH: double precision equivalents of the above. */
49 #else
50 #define SAMPLE_FORMAT double
51 #define SQRT sqrt
52 #define HYPOT hypot
53 #define ctype AVComplexDouble
54 #define ftype double
55 #define TX_TYPE AV_TX_DOUBLE_RDFT
56 #define ONE 1.0
57 #define ZERO 0.0
58 #define HALF 0.5
59 #define SIN sin
60 #define CLIP av_clipd
61 #define EPSILON DBL_EPSILON
62 #endif
63 
/*
 * fn(name) pastes "_" and SAMPLE_FORMAT onto the last token of its argument,
 * e.g. fn(flux) becomes flux_float or flux_double. The extra fn2/fn3
 * indirection makes SAMPLE_FORMAT expand before the ## paste happens.
 */
64 #define fn3(a,b) a##_##b
65 #define fn2(a,b) fn3(a,b)
66 #define fn(a) fn2(a, SAMPLE_FORMAT)
67 
69 {
70  AudioDialogueEnhanceContext *s = ctx->priv;
71  ftype scale = ONE, iscale = ONE / (s->fft_size * 1.5f);
72  int ret;
73 
74  s->window = av_calloc(s->fft_size, sizeof(ftype));
75  if (!s->window)
76  return AVERROR(ENOMEM);
77  fn(s->window) = s->window;
78  for (int n = 0; n < s->fft_size; n++)
79  fn(s->window)[n] = SIN(M_PI*n/(s->fft_size-1));
80 
81  ret = av_tx_init(&s->tx_ctx[0], &s->tx_fn, TX_TYPE, 0, s->fft_size, &scale, 0);
82  if (ret < 0)
83  return ret;
84 
85  ret = av_tx_init(&s->tx_ctx[1], &s->tx_fn, TX_TYPE, 0, s->fft_size, &scale, 0);
86  if (ret < 0)
87  return ret;
88 
89  ret = av_tx_init(&s->itx_ctx, &s->itx_fn, TX_TYPE, 1, s->fft_size, &iscale, 0);
90  if (ret < 0)
91  return ret;
92 
93  return 0;
94 }
95 
96 static void fn(apply_window)(AudioDialogueEnhanceContext *s,
97  const ftype *in_frame, ftype *out_frame, const int add_to_out_frame)
98 {
99  const ftype *window = fn(s->window);
100  const int fft_size = s->fft_size;
101 
102  if (add_to_out_frame) {
103  for (int i = 0; i < fft_size; i++)
104  out_frame[i] += in_frame[i] * window[i];
105  } else {
106  for (int i = 0; i < fft_size; i++)
107  out_frame[i] = in_frame[i] * window[i];
108  }
109 }
110 
111 static ftype fn(sqr)(ftype x)
112 {
113  return x * x;
114 }
115 
116 static void fn(get_centere)(ctype *left, ctype *right,
117  ctype *center, int N)
118 {
119  for (int i = 0; i < N; i++) {
120  const ftype l_re = left[i].re;
121  const ftype l_im = left[i].im;
122  const ftype r_re = right[i].re;
123  const ftype r_im = right[i].im;
124  const ftype a = HALF * (ONE - SQRT((fn(sqr)(l_re - r_re) + fn(sqr)(l_im - r_im))/
125  (fn(sqr)(l_re + r_re) + fn(sqr)(l_im + r_im) + EPSILON)));
126 
127  center[i].re = a * (l_re + r_re);
128  center[i].im = a * (l_im + r_im);
129  }
130 }
131 
132 static ftype fn(flux)(ftype *curf, ftype *prevf, int N)
133 {
134  ctype *cur = (ctype *)curf;
135  ctype *prev = (ctype *)prevf;
136  ftype sum = ZERO;
137 
138  for (int i = 0; i < N; i++) {
139  ftype c_re = cur[i].re;
140  ftype c_im = cur[i].im;
141  ftype p_re = prev[i].re;
142  ftype p_im = prev[i].im;
143 
144  sum += fn(sqr)(HYPOT(c_re, c_im) - HYPOT(p_re, p_im));
145  }
146 
147  return sum;
148 }
149 
150 static ftype fn(fluxlr)(ftype *lf, ftype *lpf,
151  ftype *rf, ftype *rpf,
152  int N)
153 {
154  ctype *l = (ctype *)lf;
155  ctype *lp = (ctype *)lpf;
156  ctype *r = (ctype *)rf;
157  ctype *rp = (ctype *)rpf;
158  ftype sum = ZERO;
159 
160  for (int i = 0; i < N; i++) {
161  ftype c_re = l[i].re - r[i].re;
162  ftype c_im = l[i].im - r[i].im;
163  ftype p_re = lp[i].re - rp[i].re;
164  ftype p_im = lp[i].im - rp[i].im;
165 
166  sum += fn(sqr)(HYPOT(c_re, c_im) - HYPOT(p_re, p_im));
167  }
168 
169  return sum;
170 }
171 
173 {
174  const ftype vad = a * (fc / (fc + flr) - HALF);
175 
176  return CLIP(vad, ZERO, ONE);
177 }
178 
179 static void fn(get_final)(ftype *c, ftype *l,
180  ftype *r, ftype vad, int N,
181  ftype original, ftype enhance)
182 {
183  ctype *center = (ctype *)c;
184  ctype *left = (ctype *)l;
185  ctype *right = (ctype *)r;
186 
187  for (int i = 0; i < N; i++) {
188  ftype cP = fn(sqr)(center[i].re) + fn(sqr)(center[i].im);
189  ftype lrP = fn(sqr)(left[i].re - right[i].re) + fn(sqr)(left[i].im - right[i].im);
190  ftype G = cP / (cP + lrP + EPSILON);
191  ftype re, im;
192 
193  re = center[i].re * (original + vad * G * enhance);
194  im = center[i].im * (original + vad * G * enhance);
195 
196  center[i].re = re;
197  center[i].im = im;
198  }
199 }
200 
202 {
203  AudioDialogueEnhanceContext *s = ctx->priv;
204  ftype *center = (ftype *)s->center_frame->extended_data[0];
205  ftype *center_prev = (ftype *)s->center_frame->extended_data[1];
206  ftype *left_in = (ftype *)s->in_frame->extended_data[0];
207  ftype *right_in = (ftype *)s->in_frame->extended_data[1];
208  ftype *left_out = (ftype *)s->out_dist_frame->extended_data[0];
209  ftype *right_out = (ftype *)s->out_dist_frame->extended_data[1];
210  ftype *left_samples = (ftype *)s->in->extended_data[0];
211  ftype *right_samples = (ftype *)s->in->extended_data[1];
212  ftype *windowed_left = (ftype *)s->windowed_frame->extended_data[0];
213  ftype *windowed_right = (ftype *)s->windowed_frame->extended_data[1];
214  ftype *windowed_oleft = (ftype *)s->windowed_out->extended_data[0];
215  ftype *windowed_oright = (ftype *)s->windowed_out->extended_data[1];
216  ftype *windowed_pleft = (ftype *)s->windowed_prev->extended_data[0];
217  ftype *windowed_pright = (ftype *)s->windowed_prev->extended_data[1];
218  ftype *left_osamples = (ftype *)out->extended_data[0];
219  ftype *right_osamples = (ftype *)out->extended_data[1];
220  ftype *center_osamples = (ftype *)out->extended_data[2];
221  const int overlap = s->overlap;
222  const int offset = s->fft_size - overlap;
223  const int nb_samples = FFMIN(overlap, s->in->nb_samples);
224  ftype vad;
225 
226  // shift in/out buffers
227  memmove(left_in, &left_in[overlap], offset * sizeof(ftype));
228  memmove(right_in, &right_in[overlap], offset * sizeof(ftype));
229  memmove(left_out, &left_out[overlap], offset * sizeof(ftype));
230  memmove(right_out, &right_out[overlap], offset * sizeof(ftype));
231 
232  memcpy(&left_in[offset], left_samples, nb_samples * sizeof(ftype));
233  memcpy(&right_in[offset], right_samples, nb_samples * sizeof(ftype));
234  memset(&left_out[offset], 0, overlap * sizeof(ftype));
235  memset(&right_out[offset], 0, overlap * sizeof(ftype));
236 
237  fn(apply_window)(s, left_in, windowed_left, 0);
238  fn(apply_window)(s, right_in, windowed_right, 0);
239 
240  s->tx_fn(s->tx_ctx[0], windowed_oleft, windowed_left, sizeof(ftype));
241  s->tx_fn(s->tx_ctx[1], windowed_oright, windowed_right, sizeof(ftype));
242 
243  fn(get_centere)((ctype *)windowed_oleft,
244  (ctype *)windowed_oright,
245  (ctype *)center,
246  s->fft_size / 2 + 1);
247 
248  vad = fn(calc_vad)(fn(flux)(center, center_prev, s->fft_size / 2 + 1),
249  fn(fluxlr)(windowed_oleft, windowed_pleft,
250  windowed_oright, windowed_pright, s->fft_size / 2 + 1), s->voice);
251  vad = vad * 0.1 + 0.9 * fn(s->prev_vad);
252  fn(s->prev_vad) = vad;
253 
254  memcpy(center_prev, center, s->fft_size * sizeof(ftype));
255  memcpy(windowed_pleft, windowed_oleft, s->fft_size * sizeof(ftype));
256  memcpy(windowed_pright, windowed_oright, s->fft_size * sizeof(ftype));
257 
258  fn(get_final)(center, windowed_oleft, windowed_oright, vad, s->fft_size / 2 + 1,
259  s->original, s->enhance);
260 
261  s->itx_fn(s->itx_ctx, windowed_oleft, center, sizeof(ctype));
262 
263  fn(apply_window)(s, windowed_oleft, left_out, 1);
264 
265  memcpy(left_osamples, left_in, overlap * sizeof(ftype));
266  memcpy(right_osamples, right_in, overlap * sizeof(ftype));
267 
268  if (ctx->is_disabled)
269  memset(center_osamples, 0, overlap * sizeof(ftype));
270  else
271  memcpy(center_osamples, left_out, overlap * sizeof(ftype));
272 
273  return 0;
274 }
r
const char * r
Definition: vf_curves.c:126
AVERROR
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
fluxlr
static ftype fn() fluxlr(ftype *lf, ftype *lpf, ftype *rf, ftype *rpf, int N)
Definition: dialoguenhance_template.c:150
out
FILE * out
Definition: movenc.c:54
AVFrame
This structure describes decoded (raw) audio or video data.
Definition: frame.h:344
ZERO
#define ZERO
Definition: dialoguenhance_template.c:57
fn
#define fn(a)
Definition: dialoguenhance_template.c:66
fc
#define fc(width, name, range_min, range_max)
Definition: cbs_av1.c:472
CLIP
#define CLIP
Definition: dialoguenhance_template.c:60
av_tx_init
av_cold int av_tx_init(AVTXContext **ctx, av_tx_fn *tx, enum AVTXType type, int inv, int len, const void *scale, uint64_t flags)
Initialize a transform context with the given configuration (i)MDCTs with an odd length are currently...
Definition: tx.c:902
HYPOT
#define HYPOT
Definition: dialoguenhance_template.c:52
lpf
static float * lpf(float Fn, float Fc, float tbw, int *num_taps, float att, float *beta, int round)
Definition: asrc_sinc.c:161
window
static SDL_Window * window
Definition: ffplay.c:364
calc_vad
static ftype fn() calc_vad(ftype fc, ftype flr, ftype a)
Definition: dialoguenhance_template.c:172
sqr
static ftype fn() sqr(ftype x)
Definition: dialoguenhance_template.c:111
ftype
#define ftype
Definition: dialoguenhance_template.c:54
HALF
#define HALF
Definition: dialoguenhance_template.c:58
s
#define s(width, name)
Definition: cbs_vp9.c:198
get_final
static void fn() get_final(ftype *c, ftype *l, ftype *r, ftype vad, int N, ftype original, ftype enhance)
Definition: dialoguenhance_template.c:179
de_tx_init
static int fn() de_tx_init(AVFilterContext *ctx)
Definition: dialoguenhance_template.c:68
EPSILON
#define EPSILON
Definition: dialoguenhance_template.c:61
ctx
AVFormatContext * ctx
Definition: movenc.c:48
get_centere
static void fn() get_centere(ctype *left, ctype *right, ctype *center, int N)
Definition: dialoguenhance_template.c:116
SIN
#define SIN
Definition: dialoguenhance_template.c:59
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
ctype
#define ctype
Definition: dialoguenhance_template.c:53
scale
static void scale(int *out, const int *in, const int w, const int h, const int shift)
Definition: vvc_intra.c:291
ONE
#define ONE
Definition: dialoguenhance_template.c:56
a
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:41
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
N
#define N
Definition: af_mcompand.c:53
M_PI
#define M_PI
Definition: mathematics.h:67
internal.h
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:255
TX_TYPE
#define TX_TYPE
Definition: dialoguenhance_template.c:55
FFMIN
#define FFMIN(a, b)
Definition: macros.h:49
de_stereo
static int fn() de_stereo(AVFilterContext *ctx, AVFrame *out)
Definition: dialoguenhance_template.c:201
av_calloc
void * av_calloc(size_t nmemb, size_t size)
Definition: mem.c:262
apply_window
static void fn() apply_window(AudioDialogueEnhanceContext *s, const ftype *in_frame, ftype *out_frame, const int add_to_out_frame)
Definition: dialoguenhance_template.c:96
ret
ret
Definition: filter_design.txt:187
left
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2] ... the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so ...,+,-,+,-,+,+,-,+,-,+,... hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32 - hcoeff[1] - hcoeff[2] - ... a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2} an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||......... intra?||||:Block01 :yes no||||:Block02 :....... 
..........||||:Block03 ::y DC ::ref index:||||:Block04 ::cb DC ::motion x :||||......... :cr DC ::motion y :||||....... ..........|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------ ------------ ------------|||Y subbands||Cb subbands||Cr subbands||||--- ---||--- ---||--- ---|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------ ------------ ------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction ------------|\ Dequantization ------------------- \||Reference frames|\ IDWT|------- -------|Motion \|||Frame 0||Frame 1||Compensation . OBMC v -------|------- -------|--------------. \------> Frame n output Frame Frame<----------------------------------/|...|------------------- Range Coder:============Binary Range Coder:------------------- The implemented range coder is an adapted version based upon "Range encoding: an algorithm for removing redundancy from a digitised message." by G. N. N. Martin. The symbols encoded by the Snow range coder are bits(0|1). The associated probabilities are not fix but change depending on the symbol mix seen so far. 
bit seen|new state ---------+----------------------------------------------- 0|256 - state_transition_table[256 - old_state];1|state_transition_table[old_state];state_transition_table={ 0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:------------------------- FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector 
Prediction:=========================1. the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled left
Definition: snow.txt:386
avfilter.h
G
#define G
Definition: huffyuv.h:43
AVFilterContext
An instance of a filter.
Definition: avfilter.h:407
audio.h
SQRT
#define SQRT
Definition: dialoguenhance_template.c:51
tx.h
flux
static ftype fn() flux(ftype *curf, ftype *prevf, int N)
Definition: dialoguenhance_template.c:132