FFmpeg
af_arnndn.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2018 Gregor Richards
3  * Copyright (c) 2017 Mozilla
4  * Copyright (c) 2005-2009 Xiph.Org Foundation
5  * Copyright (c) 2007-2008 CSIRO
6  * Copyright (c) 2008-2011 Octasic Inc.
7  * Copyright (c) Jean-Marc Valin
8  * Copyright (c) 2019 Paul B Mahol
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  *
14  * - Redistributions of source code must retain the above copyright
15  * notice, this list of conditions and the following disclaimer.
16  *
17  * - Redistributions in binary form must reproduce the above copyright
18  * notice, this list of conditions and the following disclaimer in the
19  * documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
25  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "libavutil/avassert.h"
35 #include "libavutil/file_open.h"
36 #include "libavutil/float_dsp.h"
37 #include "libavutil/mem.h"
38 #include "libavutil/mem_internal.h"
39 #include "libavutil/opt.h"
40 #include "libavutil/tx.h"
41 #include "avfilter.h"
42 #include "audio.h"
43 #include "filters.h"
44 #include "formats.h"
45 
/* Frame geometry. FRAME_SIZE is 120 << 2 = 480 samples, the analysis window
 * is two frames long and FREQ_SIZE is the number of spectrum bins kept. */
#define FRAME_SIZE_SHIFT 2
#define FRAME_SIZE (120<<FRAME_SIZE_SHIFT)
#define WINDOW_SIZE (2*FRAME_SIZE)
#define FREQ_SIZE (FRAME_SIZE + 1)

/* Pitch analysis limits, in samples. PITCH_BUF_SIZE is the sum of the
 * longest period and the pitch frame size. */
#define PITCH_MIN_PERIOD 60
#define PITCH_MAX_PERIOD 768
#define PITCH_FRAME_SIZE 960
#define PITCH_BUF_SIZE (PITCH_MAX_PERIOD+PITCH_FRAME_SIZE)

/* NOTE: evaluates its argument twice; only pass side-effect free expressions. */
#define SQUARE(x) ((x)*(x))

/* Number of frequency bands (matches the number of eband5ms entries). */
#define NB_BANDS 22

/* CEPS_MEM: number of slots in the cepstral history ring (memid wraps at
 * this value). NB_DELTA_CEPS: number of leading coefficients that also get
 * delta features. */
#define CEPS_MEM 8
#define NB_DELTA_CEPS 6

/* Length of the feature vector: 22 + 3*6 + 2 = 42. */
#define NB_FEATURES (NB_BANDS+3*NB_DELTA_CEPS+2)

#define WEIGHTS_SCALE (1.f/256)

#define MAX_NEURONS 128

/* Activation identifiers stored in the layer structs. */
#define ACTIVATION_TANH 0
#define ACTIVATION_SIGMOID 1
#define ACTIVATION_RELU 2

/* Unity gain constant (name presumably inherited from fixed-point code). */
#define Q15ONE 1.0f
74 
75 typedef struct DenseLayer {
76  const float *bias;
77  const float *input_weights;
78  int nb_inputs;
81 } DenseLayer;
82 
83 typedef struct GRULayer {
84  const float *bias;
85  const float *input_weights;
86  const float *recurrent_weights;
87  int nb_inputs;
90 } GRULayer;
91 
92 typedef struct RNNModel {
95 
97  const GRULayer *vad_gru;
98 
101 
104 
107 
110 } RNNModel;
111 
112 typedef struct RNNState {
117 } RNNState;
118 
119 typedef struct DenoiseState {
122  int memid;
126  float last_gain;
128  float mem_hp_x[2];
129  float lastg[NB_BANDS];
134 } DenoiseState;
135 
136 typedef struct AudioRNNContext {
137  const AVClass *class;
138 
139  char *model_name;
140  float mix;
141 
142  int channels;
144 
147 
149 
152 
/* Activation identifiers as they appear in the model file; the loader's
 * INPUT_ACTIVATION macro maps them onto the ACTIVATION_* constants. */
#define F_ACTIVATION_TANH 0
#define F_ACTIVATION_SIGMOID 1
#define F_ACTIVATION_RELU 2
156 
157 static void rnnoise_model_free(RNNModel *model)
158 {
159 #define FREE_MAYBE(ptr) do { if (ptr) free(ptr); } while (0)
160 #define FREE_DENSE(name) do { \
161  if (model->name) { \
162  av_free((void *) model->name->input_weights); \
163  av_free((void *) model->name->bias); \
164  av_free((void *) model->name); \
165  } \
166  } while (0)
167 #define FREE_GRU(name) do { \
168  if (model->name) { \
169  av_free((void *) model->name->input_weights); \
170  av_free((void *) model->name->recurrent_weights); \
171  av_free((void *) model->name->bias); \
172  av_free((void *) model->name); \
173  } \
174  } while (0)
175 
176  if (!model)
177  return;
178  FREE_DENSE(input_dense);
179  FREE_GRU(vad_gru);
180  FREE_GRU(noise_gru);
181  FREE_GRU(denoise_gru);
182  FREE_DENSE(denoise_output);
183  FREE_DENSE(vad_output);
184  av_free(model);
185 }
186 
187 static int rnnoise_model_from_file(FILE *f, RNNModel **rnn)
188 {
189  RNNModel *ret = NULL;
190  DenseLayer *input_dense;
191  GRULayer *vad_gru;
192  GRULayer *noise_gru;
193  GRULayer *denoise_gru;
194  DenseLayer *denoise_output;
195  DenseLayer *vad_output;
196  int in;
197 
198  if (fscanf(f, "rnnoise-nu model file version %d\n", &in) != 1 || in != 1)
199  return AVERROR_INVALIDDATA;
200 
201  ret = av_calloc(1, sizeof(RNNModel));
202  if (!ret)
203  return AVERROR(ENOMEM);
204 
205 #define ALLOC_LAYER(type, name) \
206  name = av_calloc(1, sizeof(type)); \
207  if (!name) { \
208  rnnoise_model_free(ret); \
209  return AVERROR(ENOMEM); \
210  } \
211  ret->name = name
212 
213  ALLOC_LAYER(DenseLayer, input_dense);
214  ALLOC_LAYER(GRULayer, vad_gru);
215  ALLOC_LAYER(GRULayer, noise_gru);
216  ALLOC_LAYER(GRULayer, denoise_gru);
217  ALLOC_LAYER(DenseLayer, denoise_output);
218  ALLOC_LAYER(DenseLayer, vad_output);
219 
220 #define INPUT_VAL(name) do { \
221  if (fscanf(f, "%d", &in) != 1 || in < 0 || in > 128) { \
222  rnnoise_model_free(ret); \
223  return AVERROR(EINVAL); \
224  } \
225  name = in; \
226  } while (0)
227 
228 #define INPUT_ACTIVATION(name) do { \
229  int activation; \
230  INPUT_VAL(activation); \
231  switch (activation) { \
232  case F_ACTIVATION_SIGMOID: \
233  name = ACTIVATION_SIGMOID; \
234  break; \
235  case F_ACTIVATION_RELU: \
236  name = ACTIVATION_RELU; \
237  break; \
238  default: \
239  name = ACTIVATION_TANH; \
240  } \
241  } while (0)
242 
243 #define INPUT_ARRAY(name, len) do { \
244  float *values = av_calloc((len), sizeof(float)); \
245  if (!values) { \
246  rnnoise_model_free(ret); \
247  return AVERROR(ENOMEM); \
248  } \
249  name = values; \
250  for (int i = 0; i < (len); i++) { \
251  if (fscanf(f, "%d", &in) != 1) { \
252  rnnoise_model_free(ret); \
253  return AVERROR(EINVAL); \
254  } \
255  values[i] = in; \
256  } \
257  } while (0)
258 
259 #define INPUT_ARRAY3(name, len0, len1, len2) do { \
260  float *values = av_calloc(FFALIGN((len0), 4) * FFALIGN((len1), 4) * (len2), sizeof(float)); \
261  if (!values) { \
262  rnnoise_model_free(ret); \
263  return AVERROR(ENOMEM); \
264  } \
265  name = values; \
266  for (int k = 0; k < (len0); k++) { \
267  for (int i = 0; i < (len2); i++) { \
268  for (int j = 0; j < (len1); j++) { \
269  if (fscanf(f, "%d", &in) != 1) { \
270  rnnoise_model_free(ret); \
271  return AVERROR(EINVAL); \
272  } \
273  values[j * (len2) * FFALIGN((len0), 4) + i * FFALIGN((len0), 4) + k] = in; \
274  } \
275  } \
276  } \
277  } while (0)
278 
279 #define NEW_LINE() do { \
280  int c; \
281  while ((c = fgetc(f)) != EOF) { \
282  if (c == '\n') \
283  break; \
284  } \
285  } while (0)
286 
287 #define INPUT_DENSE(name) do { \
288  INPUT_VAL(name->nb_inputs); \
289  INPUT_VAL(name->nb_neurons); \
290  ret->name ## _size = name->nb_neurons; \
291  INPUT_ACTIVATION(name->activation); \
292  NEW_LINE(); \
293  INPUT_ARRAY(name->input_weights, name->nb_inputs * name->nb_neurons); \
294  NEW_LINE(); \
295  INPUT_ARRAY(name->bias, name->nb_neurons); \
296  NEW_LINE(); \
297  } while (0)
298 
299 #define INPUT_GRU(name) do { \
300  INPUT_VAL(name->nb_inputs); \
301  INPUT_VAL(name->nb_neurons); \
302  ret->name ## _size = name->nb_neurons; \
303  INPUT_ACTIVATION(name->activation); \
304  NEW_LINE(); \
305  INPUT_ARRAY3(name->input_weights, name->nb_inputs, name->nb_neurons, 3); \
306  NEW_LINE(); \
307  INPUT_ARRAY3(name->recurrent_weights, name->nb_neurons, name->nb_neurons, 3); \
308  NEW_LINE(); \
309  INPUT_ARRAY(name->bias, name->nb_neurons * 3); \
310  NEW_LINE(); \
311  } while (0)
312 
313  INPUT_DENSE(input_dense);
314  INPUT_GRU(vad_gru);
315  INPUT_GRU(noise_gru);
316  INPUT_GRU(denoise_gru);
317  INPUT_DENSE(denoise_output);
318  INPUT_DENSE(vad_output);
319 
320  if (vad_output->nb_neurons != 1) {
322  return AVERROR(EINVAL);
323  }
324 
325  *rnn = ret;
326 
327  return 0;
328 }
329 
331  AVFilterFormatsConfig **cfg_in,
332  AVFilterFormatsConfig **cfg_out)
333 {
334  static const enum AVSampleFormat sample_fmts[] = {
337  };
338  int ret, sample_rates[] = { 48000, -1 };
339 
341  if (ret < 0)
342  return ret;
343 
344  return ff_set_common_samplerates_from_list2(ctx, cfg_in, cfg_out, sample_rates);
345 }
346 
348 {
349  AVFilterContext *ctx = inlink->dst;
350  AudioRNNContext *s = ctx->priv;
351  int ret = 0;
352 
353  s->channels = inlink->ch_layout.nb_channels;
354 
355  if (!s->st)
356  s->st = av_calloc(s->channels, sizeof(DenoiseState));
357  if (!s->st)
358  return AVERROR(ENOMEM);
359 
360  for (int i = 0; i < s->channels; i++) {
361  DenoiseState *st = &s->st[i];
362 
363  st->rnn[0].model = s->model[0];
364  st->rnn[0].vad_gru_state = av_calloc(sizeof(float), FFALIGN(s->model[0]->vad_gru_size, 16));
365  st->rnn[0].noise_gru_state = av_calloc(sizeof(float), FFALIGN(s->model[0]->noise_gru_size, 16));
366  st->rnn[0].denoise_gru_state = av_calloc(sizeof(float), FFALIGN(s->model[0]->denoise_gru_size, 16));
367  if (!st->rnn[0].vad_gru_state ||
368  !st->rnn[0].noise_gru_state ||
369  !st->rnn[0].denoise_gru_state)
370  return AVERROR(ENOMEM);
371  }
372 
373  for (int i = 0; i < s->channels; i++) {
374  DenoiseState *st = &s->st[i];
375  float scale = 1.f;
376 
377  if (!st->tx)
378  ret = av_tx_init(&st->tx, &st->tx_fn, AV_TX_FLOAT_FFT, 0, WINDOW_SIZE, &scale, 0);
379  if (ret < 0)
380  return ret;
381 
382  if (!st->txi)
383  ret = av_tx_init(&st->txi, &st->txi_fn, AV_TX_FLOAT_FFT, 1, WINDOW_SIZE, &scale, 0);
384  if (ret < 0)
385  return ret;
386  }
387 
388  return ret;
389 }
390 
/**
 * Filter N samples through a two-state recursive section.
 *
 * Each output is the input plus the first state value; both state values
 * are then updated from the input, the output and the b/a coefficient
 * pairs. x and y may point to the same buffer.
 *
 * @param y   output samples
 * @param mem two state values, carried across calls and updated here
 * @param x   input samples
 * @param b   two coefficients applied to the input
 * @param a   two coefficients applied to the output
 * @param N   number of samples
 */
static void biquad(float *y, float mem[2], const float *x,
                   const float *b, const float *a, int N)
{
    float s0 = mem[0];
    float s1 = mem[1];

    for (int n = 0; n < N; n++) {
        const float in  = x[n];
        const float out = in + s0;

        s0 = s1 + (b[0] * in - a[0] * out);
        s1 = (b[1] * in - a[1] * out);
        y[n] = out;
    }

    mem[0] = s0;
    mem[1] = s1;
}
404 
/*
 * Element-count wrappers around memmove/memset/memcpy: n is a number of
 * elements of *dst, not bytes. The "+ 0*((dst)-(src))" term contributes
 * nothing to the size; the pointer subtraction only compiles when dst and
 * src have compatible pointer types, so it acts as a compile-time check.
 */
#define RNN_MOVE(dst, src, n) (memmove((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src)) ))
#define RNN_CLEAR(dst, n) (memset((dst), 0, (n)*sizeof(*(dst))))
#define RNN_COPY(dst, src, n) (memcpy((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src)) ))
408 
409 static void forward_transform(DenoiseState *st, AVComplexFloat *out, const float *in)
410 {
413 
414  for (int i = 0; i < WINDOW_SIZE; i++) {
415  x[i].re = in[i];
416  x[i].im = 0;
417  }
418 
419  st->tx_fn(st->tx, y, x, sizeof(AVComplexFloat));
420 
421  RNN_COPY(out, y, FREQ_SIZE);
422 }
423 
424 static void inverse_transform(DenoiseState *st, float *out, const AVComplexFloat *in)
425 {
428 
429  RNN_COPY(x, in, FREQ_SIZE);
430 
431  for (int i = FREQ_SIZE; i < WINDOW_SIZE; i++) {
432  x[i].re = x[WINDOW_SIZE - i].re;
433  x[i].im = -x[WINDOW_SIZE - i].im;
434  }
435 
436  st->txi_fn(st->txi, y, x, sizeof(AVComplexFloat));
437 
438  for (int i = 0; i < WINDOW_SIZE; i++)
439  out[i] = y[i].re / WINDOW_SIZE;
440 }
441 
/*
 * Band edge positions. Users shift each entry left by FRAME_SIZE_SHIFT to
 * obtain a spectrum bin index. The comment row inside the table gives the
 * frequency associated with each edge.
 */
static const uint8_t eband5ms[] = {
/*0 200 400 600 800 1k 1.2 1.4 1.6 2k 2.4 2.8 3.2 4k 4.8 5.6 6.8 8k 9.6 12k 15.6 20k*/
  0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 34, 40, 48, 60, 78, 100
};
446 
447 static void compute_band_energy(float *bandE, const AVComplexFloat *X)
448 {
449  float sum[NB_BANDS] = {0};
450 
451  for (int i = 0; i < NB_BANDS - 1; i++) {
452  int band_size;
453 
454  band_size = (eband5ms[i + 1] - eband5ms[i]) << FRAME_SIZE_SHIFT;
455  for (int j = 0; j < band_size; j++) {
456  float tmp, frac = (float)j / band_size;
457 
458  tmp = SQUARE(X[(eband5ms[i] << FRAME_SIZE_SHIFT) + j].re);
459  tmp += SQUARE(X[(eband5ms[i] << FRAME_SIZE_SHIFT) + j].im);
460  sum[i] += (1.f - frac) * tmp;
461  sum[i + 1] += frac * tmp;
462  }
463  }
464 
465  sum[0] *= 2;
466  sum[NB_BANDS - 1] *= 2;
467 
468  for (int i = 0; i < NB_BANDS; i++)
469  bandE[i] = sum[i];
470 }
471 
472 static void compute_band_corr(float *bandE, const AVComplexFloat *X, const AVComplexFloat *P)
473 {
474  float sum[NB_BANDS] = { 0 };
475 
476  for (int i = 0; i < NB_BANDS - 1; i++) {
477  int band_size;
478 
479  band_size = (eband5ms[i + 1] - eband5ms[i]) << FRAME_SIZE_SHIFT;
480  for (int j = 0; j < band_size; j++) {
481  float tmp, frac = (float)j / band_size;
482 
483  tmp = X[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].re * P[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].re;
484  tmp += X[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].im * P[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].im;
485  sum[i] += (1 - frac) * tmp;
486  sum[i + 1] += frac * tmp;
487  }
488  }
489 
490  sum[0] *= 2;
491  sum[NB_BANDS-1] *= 2;
492 
493  for (int i = 0; i < NB_BANDS; i++)
494  bandE[i] = sum[i];
495 }
496 
497 static void frame_analysis(AudioRNNContext *s, DenoiseState *st, AVComplexFloat *X, float *Ex, const float *in)
498 {
499  LOCAL_ALIGNED_32(float, x, [WINDOW_SIZE]);
500 
502  RNN_COPY(x + FRAME_SIZE, in, FRAME_SIZE);
503  RNN_COPY(st->analysis_mem, in, FRAME_SIZE);
504  s->fdsp->vector_fmul(x, x, s->window, WINDOW_SIZE);
505  forward_transform(st, X, x);
506  compute_band_energy(Ex, X);
507 }
508 
509 static void frame_synthesis(AudioRNNContext *s, DenoiseState *st, float *out, const AVComplexFloat *y)
510 {
511  LOCAL_ALIGNED_32(float, x, [WINDOW_SIZE]);
512  const float *src = st->history;
513  const float mix = s->mix;
514  const float imix = 1.f - FFMAX(mix, 0.f);
515 
516  inverse_transform(st, x, y);
517  s->fdsp->vector_fmul(x, x, s->window, WINDOW_SIZE);
518  s->fdsp->vector_fmac_scalar(x, st->synthesis_mem, 1.f, FRAME_SIZE);
519  RNN_COPY(out, x, FRAME_SIZE);
521 
522  for (int n = 0; n < FRAME_SIZE; n++)
523  out[n] = out[n] * mix + src[n] * imix;
524 }
525 
/* Add xv times each of the four window values to the four running sums. */
static inline void xcorr_step(float acc[4], float xv,
                              float w0, float w1, float w2, float w3)
{
    acc[0] += xv * w0;
    acc[1] += xv * w1;
    acc[2] += xv * w2;
    acc[3] += xv * w3;
}

/**
 * Accumulate four adjacent cross-correlation lags at once.
 *
 * Adds to sum[k], for k = 0..3, the sum over j in [0, len) of x[j]*y[j+k].
 * A sliding window of four y values is rotated through four locals so that
 * each y sample is loaded only once; the main loop handles four x samples
 * per iteration and up to three leftovers are handled afterwards.
 *
 * @param x   len input samples
 * @param y   at least len + 3 samples (three are read even when len <= 0)
 * @param sum four accumulators, updated in place (not cleared here)
 * @param len number of x samples
 */
static inline void xcorr_kernel(const float *x, const float *y, float sum[4], int len)
{
    float w0 = y[0];
    float w1 = y[1];
    float w2 = y[2];
    float w3 = 0;
    int pos = 0;
    int left;

    y += 3;

    for (; pos < len - 3; pos += 4) {
        float xv;

        xv = x[pos];
        w3 = y[pos];
        xcorr_step(sum, xv, w0, w1, w2, w3);

        xv = x[pos + 1];
        w0 = y[pos + 1];
        xcorr_step(sum, xv, w1, w2, w3, w0);

        xv = x[pos + 2];
        w1 = y[pos + 2];
        xcorr_step(sum, xv, w2, w3, w0, w1);

        xv = x[pos + 3];
        w2 = y[pos + 3];
        xcorr_step(sum, xv, w3, w0, w1, w2);
    }

    /* Zero to three samples remain; continue the same rotation. */
    left = len - pos;

    if (left >= 1) {
        const float xv = x[pos];

        w3 = y[pos];
        xcorr_step(sum, xv, w0, w1, w2, w3);
    }

    if (left >= 2) {
        const float xv = x[pos + 1];

        w0 = y[pos + 1];
        xcorr_step(sum, xv, w1, w2, w3, w0);
    }

    if (left >= 3) {
        const float xv = x[pos + 2];

        w1 = y[pos + 2];
        xcorr_step(sum, xv, w2, w3, w0, w1);
    }
}
594 
/**
 * Dot product of two float vectors.
 *
 * @param x first vector
 * @param y second vector
 * @param N number of elements; the result is 0 when N <= 0
 * @return sum of x[i]*y[i], accumulated from the first element on
 */
static inline float celt_inner_prod(const float *x,
                                    const float *y, int N)
{
    float acc = 0.f;

    for (int left = N; left > 0; left--)
        acc += *x++ * *y++;

    return acc;
}
605 
/**
 * Cross-correlate x against y for lags 0 .. max_pitch-1.
 *
 * xcorr[i] receives the sum over j in [0, len) of x[j]*y[i+j]. Lags are
 * processed four at a time through xcorr_kernel(); leftover lags (when
 * max_pitch is not a multiple of 4) use a plain inner product.
 *
 * @param x         len samples
 * @param y         samples to correlate against
 * @param xcorr     output, max_pitch values
 * @param len       correlation length
 * @param max_pitch number of lags to compute
 */
static void celt_pitch_xcorr(const float *x, const float *y,
                             float *xcorr, int len, int max_pitch)
{
    int lag = 0;

    while (lag < max_pitch - 3) {
        float quad[4] = { 0, 0, 0, 0 };

        xcorr_kernel(x, y + lag, quad, len);

        for (int k = 0; k < 4; k++)
            xcorr[lag + k] = quad[k];

        lag += 4;
    }

    /* Remaining lags, one at a time. */
    while (lag < max_pitch) {
        xcorr[lag] = celt_inner_prod(x, y + lag, len);
        lag++;
    }
}
626 
627 static int celt_autocorr(const float *x, /* in: [0...n-1] samples x */
628  float *ac, /* out: [0...lag-1] ac values */
629  const float *window,
630  int overlap,
631  int lag,
632  int n)
633 {
634  int fastN = n - lag;
635  int shift;
636  const float *xptr;
637  float xx[PITCH_BUF_SIZE>>1];
638 
639  if (overlap == 0) {
640  xptr = x;
641  } else {
642  for (int i = 0; i < n; i++)
643  xx[i] = x[i];
644  for (int i = 0; i < overlap; i++) {
645  xx[i] = x[i] * window[i];
646  xx[n-i-1] = x[n-i-1] * window[i];
647  }
648  xptr = xx;
649  }
650 
651  shift = 0;
652  celt_pitch_xcorr(xptr, xptr, ac, fastN, lag+1);
653 
654  for (int k = 0; k <= lag; k++) {
655  float d = 0.f;
656 
657  for (int i = k + fastN; i < n; i++)
658  d += xptr[i] * xptr[i-k];
659  ac[k] += d;
660  }
661 
662  return shift;
663 }
664 
/**
 * Derive LPC coefficients from autocorrelation values by an iterative
 * reflection-coefficient recursion.
 *
 * lpc is cleared first; if ac[0] is zero it stays all-zero. The recursion
 * stops early once the residual error drops below 0.001 * ac[0] (30 dB
 * gain), leaving the remaining coefficients at zero.
 *
 * @param lpc out: [0 .. p-1] LPC coefficients
 * @param ac  in:  [0 .. p] autocorrelation values
 * @param p   prediction order
 */
static void celt_lpc(float *lpc,
                     const float *ac,
                     int p)
{
    float err = ac[0];

    RNN_CLEAR(lpc, p);

    if (ac[0] == 0)
        return;

    for (int order = 0; order < p; order++) {
        float acc = 0;
        float refl;

        /* Reflection coefficient for this order. */
        for (int j = 0; j < order; j++)
            acc += (lpc[j] * ac[order - j]);
        acc += ac[order + 1];
        refl = -acc / err;

        /* Fold the new coefficient into the ones found so far, working
         * symmetrically from both ends towards the middle. */
        lpc[order] = refl;
        for (int j = 0; j < (order + 1) >> 1; j++) {
            const float head = lpc[j];
            const float tail = lpc[order - 1 - j];

            lpc[j]             = head + (refl * tail);
            lpc[order - 1 - j] = tail + (refl * head);
        }

        err = err - (refl * refl * err);

        /* Bail out once we get 30 dB gain. */
        if (err < .001f * ac[0])
            break;
    }
}
697 
/**
 * Five-tap FIR filter with state carried across calls.
 *
 * y[i] = x[i] + num[0]*x[i-1] + ... + num[4]*x[i-5], where samples before
 * the start of x are taken from mem (mem[0] being the most recent). On
 * return mem holds the last five inputs. x and y may be the same buffer.
 *
 * @param x   input samples
 * @param num five filter coefficients
 * @param y   output samples
 * @param N   number of samples
 * @param mem five state values, updated in place
 */
static void celt_fir5(const float *x,
                      const float *num,
                      float *y,
                      int N,
                      float *mem)
{
    float tap[5];
    float hist[5];

    for (int k = 0; k < 5; k++)
        tap[k] = num[k];
    for (int k = 0; k < 5; k++)
        hist[k] = mem[k];

    for (int i = 0; i < N; i++) {
        const float in = x[i];
        float acc = in;

        for (int k = 0; k < 5; k++)
            acc += (tap[k] * hist[k]);

        /* Age the history by one sample and insert the newest input. */
        for (int k = 4; k > 0; k--)
            hist[k] = hist[k - 1];
        hist[0] = in;

        y[i] = acc;
    }

    for (int k = 0; k < 5; k++)
        mem[k] = hist[k];
}
740 
741 static void pitch_downsample(float *x[], float *x_lp,
742  int len, int C)
743 {
744  float ac[5];
745  float tmp=Q15ONE;
746  float lpc[4], mem[5]={0,0,0,0,0};
747  float lpc2[5];
748  float c1 = .8f;
749 
750  for (int i = 1; i < len >> 1; i++)
751  x_lp[i] = .5f * (.5f * (x[0][(2*i-1)]+x[0][(2*i+1)])+x[0][2*i]);
752  x_lp[0] = .5f * (.5f * (x[0][1])+x[0][0]);
753  if (C==2) {
754  for (int i = 1; i < len >> 1; i++)
755  x_lp[i] += (.5f * (.5f * (x[1][(2*i-1)]+x[1][(2*i+1)])+x[1][2*i]));
756  x_lp[0] += .5f * (.5f * (x[1][1])+x[1][0]);
757  }
758 
759  celt_autocorr(x_lp, ac, NULL, 0, 4, len>>1);
760 
761  /* Noise floor -40 dB */
762  ac[0] *= 1.0001f;
763  /* Lag windowing */
764  for (int i = 1; i <= 4; i++) {
765  /*ac[i] *= exp(-.5*(2*M_PI*.002*i)*(2*M_PI*.002*i));*/
766  ac[i] -= ac[i]*(.008f*i)*(.008f*i);
767  }
768 
769  celt_lpc(lpc, ac, 4);
770  for (int i = 0; i < 4; i++) {
771  tmp = .9f * tmp;
772  lpc[i] = (lpc[i] * tmp);
773  }
774  /* Add a zero */
775  lpc2[0] = lpc[0] + .8f;
776  lpc2[1] = lpc[1] + (c1 * lpc[0]);
777  lpc2[2] = lpc[2] + (c1 * lpc[1]);
778  lpc2[3] = lpc[3] + (c1 * lpc[2]);
779  lpc2[4] = (c1 * lpc[3]);
780  celt_fir5(x_lp, lpc2, x_lp, len>>1, mem);
781 }
782 
/**
 * Compute two dot products that share the same left operand in one pass.
 *
 * @param x   common vector
 * @param y01 first right-hand vector
 * @param y02 second right-hand vector
 * @param N   number of elements
 * @param xy1 receives sum of x[i]*y01[i]
 * @param xy2 receives sum of x[i]*y02[i]
 */
static inline void dual_inner_prod(const float *x, const float *y01, const float *y02,
                                   int N, float *xy1, float *xy2)
{
    float acc1 = 0;
    float acc2 = 0;
    int i = 0;

    while (i < N) {
        acc1 += (x[i] * y01[i]);
        acc2 += (x[i] * y02[i]);
        i++;
    }

    *xy1 = acc1;
    *xy2 = acc2;
}
796 
797 static float compute_pitch_gain(float xy, float xx, float yy)
798 {
799  return xy / sqrtf(1.f + xx * yy);
800 }
801 
802 static const uint8_t second_check[16] = {0, 0, 3, 2, 3, 2, 5, 2, 3, 2, 3, 2, 5, 2, 3, 2};
803 static float remove_doubling(float *x, int maxperiod, int minperiod, int N,
804  int *T0_, int prev_period, float prev_gain)
805 {
806  int k, i, T, T0;
807  float g, g0;
808  float pg;
809  float xy,xx,yy,xy2;
810  float xcorr[3];
811  float best_xy, best_yy;
812  int offset;
813  int minperiod0;
814  float yy_lookup[PITCH_MAX_PERIOD+1];
815 
816  minperiod0 = minperiod;
817  maxperiod /= 2;
818  minperiod /= 2;
819  *T0_ /= 2;
820  prev_period /= 2;
821  N /= 2;
822  x += maxperiod;
823  if (*T0_>=maxperiod)
824  *T0_=maxperiod-1;
825 
826  T = T0 = *T0_;
827  dual_inner_prod(x, x, x-T0, N, &xx, &xy);
828  yy_lookup[0] = xx;
829  yy=xx;
830  for (i = 1; i <= maxperiod; i++) {
831  yy = yy+(x[-i] * x[-i])-(x[N-i] * x[N-i]);
832  yy_lookup[i] = FFMAX(0, yy);
833  }
834  yy = yy_lookup[T0];
835  best_xy = xy;
836  best_yy = yy;
837  g = g0 = compute_pitch_gain(xy, xx, yy);
838  /* Look for any pitch at T/k */
839  for (k = 2; k <= 15; k++) {
840  int T1, T1b;
841  float g1;
842  float cont=0;
843  float thresh;
844  T1 = (2*T0+k)/(2*k);
845  if (T1 < minperiod)
846  break;
847  /* Look for another strong correlation at T1b */
848  if (k==2)
849  {
850  if (T1+T0>maxperiod)
851  T1b = T0;
852  else
853  T1b = T0+T1;
854  } else
855  {
856  T1b = (2*second_check[k]*T0+k)/(2*k);
857  }
858  dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2);
859  xy = .5f * (xy + xy2);
860  yy = .5f * (yy_lookup[T1] + yy_lookup[T1b]);
861  g1 = compute_pitch_gain(xy, xx, yy);
862  if (FFABS(T1-prev_period)<=1)
863  cont = prev_gain;
864  else if (FFABS(T1-prev_period)<=2 && 5 * k * k < T0)
865  cont = prev_gain * .5f;
866  else
867  cont = 0;
868  thresh = FFMAX(.3f, (.7f * g0) - cont);
869  /* Bias against very high pitch (very short period) to avoid false-positives
870  due to short-term correlation */
871  if (T1<3*minperiod)
872  thresh = FFMAX(.4f, (.85f * g0) - cont);
873  else if (T1<2*minperiod)
874  thresh = FFMAX(.5f, (.9f * g0) - cont);
875  if (g1 > thresh)
876  {
877  best_xy = xy;
878  best_yy = yy;
879  T = T1;
880  g = g1;
881  }
882  }
883  best_xy = FFMAX(0, best_xy);
884  if (best_yy <= best_xy)
885  pg = Q15ONE;
886  else
887  pg = best_xy/(best_yy + 1);
888 
889  for (k = 0; k < 3; k++)
890  xcorr[k] = celt_inner_prod(x, x-(T+k-1), N);
891  if ((xcorr[2]-xcorr[0]) > .7f * (xcorr[1]-xcorr[0]))
892  offset = 1;
893  else if ((xcorr[0]-xcorr[2]) > (.7f * (xcorr[1] - xcorr[2])))
894  offset = -1;
895  else
896  offset = 0;
897  if (pg > g)
898  pg = g;
899  *T0_ = 2*T+offset;
900 
901  if (*T0_<minperiod0)
902  *T0_=minperiod0;
903  return pg;
904 }
905 
/**
 * Find the two lags with the highest normalised correlation.
 *
 * Candidates are ranked by xcorr[i]^2 / Syy, where Syy is the energy of the
 * len-sample window of y starting at lag i (kept up to date incrementally
 * and floored at 1). Ratios are compared by cross-multiplication, so no
 * division is performed. Only lags with a positive correlation compete.
 *
 * @param xcorr      correlation per lag, max_pitch values
 * @param y          signal the correlation was computed against; samples up
 *                   to index len + max_pitch - 1 are read
 * @param len        window length
 * @param max_pitch  number of lags
 * @param best_pitch out: best lag in [0], runner-up in [1]; defaults to
 *                   {0, 1} when nothing qualifies
 */
static void find_best_pitch(float *xcorr, float *y, int len,
                            int max_pitch, int *best_pitch)
{
    float top_num[2] = { -1, -1 };
    float top_den[2] = { 0, 0 };
    float Syy = 1.f;

    best_pitch[0] = 0;
    best_pitch[1] = 1;

    for (int j = 0; j < len; j++)
        Syy += y[j] * y[j];

    for (int lag = 0; lag < max_pitch; lag++) {
        if (xcorr[lag] > 0) {
            /* Scaling before squaring keeps the square within float range
             * (avoids both underflow and overflow to inf). */
            const float scaled = xcorr[lag] * 1e-12f;
            const float num = scaled * scaled;

            if ((num * top_den[1]) > (top_num[1] * Syy)) {
                if ((num * top_den[0]) > (top_num[0] * Syy)) {
                    /* New leader: the previous leader becomes runner-up. */
                    top_num[1]    = top_num[0];
                    top_den[1]    = top_den[0];
                    best_pitch[1] = best_pitch[0];
                    top_num[0]    = num;
                    top_den[0]    = Syy;
                    best_pitch[0] = lag;
                } else {
                    top_num[1]    = num;
                    top_den[1]    = Syy;
                    best_pitch[1] = lag;
                }
            }
        }

        /* Slide the energy window forward by one sample. */
        Syy += y[lag + len] * y[lag + len] - y[lag] * y[lag];
        Syy = FFMAX(1, Syy);
    }
}
952 
953 static void pitch_search(const float *x_lp, float *y,
954  int len, int max_pitch, int *pitch)
955 {
956  int lag;
957  int best_pitch[2]={0,0};
958  int offset;
959 
960  float x_lp4[WINDOW_SIZE];
961  float y_lp4[WINDOW_SIZE];
962  float xcorr[WINDOW_SIZE];
963 
964  lag = len+max_pitch;
965 
966  /* Downsample by 2 again */
967  for (int j = 0; j < len >> 2; j++)
968  x_lp4[j] = x_lp[2*j];
969  for (int j = 0; j < lag >> 2; j++)
970  y_lp4[j] = y[2*j];
971 
972  /* Coarse search with 4x decimation */
973 
974  celt_pitch_xcorr(x_lp4, y_lp4, xcorr, len>>2, max_pitch>>2);
975 
976  find_best_pitch(xcorr, y_lp4, len>>2, max_pitch>>2, best_pitch);
977 
978  /* Finer search with 2x decimation */
979  for (int i = 0; i < max_pitch >> 1; i++) {
980  float sum;
981  xcorr[i] = 0;
982  if (FFABS(i-2*best_pitch[0])>2 && FFABS(i-2*best_pitch[1])>2)
983  continue;
984  sum = celt_inner_prod(x_lp, y+i, len>>1);
985  xcorr[i] = FFMAX(-1, sum);
986  }
987 
988  find_best_pitch(xcorr, y, len>>1, max_pitch>>1, best_pitch);
989 
990  /* Refine by pseudo-interpolation */
991  if (best_pitch[0] > 0 && best_pitch[0] < (max_pitch >> 1) - 1) {
992  float a, b, c;
993 
994  a = xcorr[best_pitch[0] - 1];
995  b = xcorr[best_pitch[0]];
996  c = xcorr[best_pitch[0] + 1];
997  if (c - a > .7f * (b - a))
998  offset = 1;
999  else if (a - c > .7f * (b-c))
1000  offset = -1;
1001  else
1002  offset = 0;
1003  } else {
1004  offset = 0;
1005  }
1006 
1007  *pitch = 2 * best_pitch[0] - offset;
1008 }
1009 
1010 static void dct(AudioRNNContext *s, float *out, const float *in)
1011 {
1012  for (int i = 0; i < NB_BANDS; i++) {
1013  float sum;
1014 
1015  sum = s->fdsp->scalarproduct_float(in, s->dct_table[i], FFALIGN(NB_BANDS, 4));
1016  out[i] = sum * sqrtf(2.f / 22);
1017  }
1018 }
1019 
1021  float *Ex, float *Ep, float *Exp, float *features, const float *in)
1022 {
1023  float E = 0;
1024  float *ceps_0, *ceps_1, *ceps_2;
1025  float spec_variability = 0;
1026  LOCAL_ALIGNED_32(float, Ly, [NB_BANDS]);
1027  LOCAL_ALIGNED_32(float, p, [WINDOW_SIZE]);
1028  float pitch_buf[PITCH_BUF_SIZE>>1];
1029  int pitch_index;
1030  float gain;
1031  float *(pre[1]);
1032  float tmp[NB_BANDS];
1033  float follow, logMax;
1034 
1035  frame_analysis(s, st, X, Ex, in);
1038  pre[0] = &st->pitch_buf[0];
1039  pitch_downsample(pre, pitch_buf, PITCH_BUF_SIZE, 1);
1040  pitch_search(pitch_buf+(PITCH_MAX_PERIOD>>1), pitch_buf, PITCH_FRAME_SIZE,
1041  PITCH_MAX_PERIOD-3*PITCH_MIN_PERIOD, &pitch_index);
1042  pitch_index = PITCH_MAX_PERIOD-pitch_index;
1043 
1045  PITCH_FRAME_SIZE, &pitch_index, st->last_period, st->last_gain);
1046  st->last_period = pitch_index;
1047  st->last_gain = gain;
1048 
1049  for (int i = 0; i < WINDOW_SIZE; i++)
1050  p[i] = st->pitch_buf[PITCH_BUF_SIZE-WINDOW_SIZE-pitch_index+i];
1051 
1052  s->fdsp->vector_fmul(p, p, s->window, WINDOW_SIZE);
1053  forward_transform(st, P, p);
1054  compute_band_energy(Ep, P);
1055  compute_band_corr(Exp, X, P);
1056 
1057  for (int i = 0; i < NB_BANDS; i++)
1058  Exp[i] = Exp[i] / sqrtf(.001f+Ex[i]*Ep[i]);
1059 
1060  dct(s, tmp, Exp);
1061 
1062  for (int i = 0; i < NB_DELTA_CEPS; i++)
1063  features[NB_BANDS+2*NB_DELTA_CEPS+i] = tmp[i];
1064 
1065  features[NB_BANDS+2*NB_DELTA_CEPS] -= 1.3;
1066  features[NB_BANDS+2*NB_DELTA_CEPS+1] -= 0.9;
1067  features[NB_BANDS+3*NB_DELTA_CEPS] = .01*(pitch_index-300);
1068  logMax = -2;
1069  follow = -2;
1070 
1071  for (int i = 0; i < NB_BANDS; i++) {
1072  Ly[i] = log10f(1e-2f + Ex[i]);
1073  Ly[i] = FFMAX(logMax-7, FFMAX(follow-1.5, Ly[i]));
1074  logMax = FFMAX(logMax, Ly[i]);
1075  follow = FFMAX(follow-1.5, Ly[i]);
1076  E += Ex[i];
1077  }
1078 
1079  if (E < 0.04f) {
1080  /* If there's no audio, avoid messing up the state. */
1081  RNN_CLEAR(features, NB_FEATURES);
1082  return 1;
1083  }
1084 
1085  dct(s, features, Ly);
1086  features[0] -= 12;
1087  features[1] -= 4;
1088  ceps_0 = st->cepstral_mem[st->memid];
1089  ceps_1 = (st->memid < 1) ? st->cepstral_mem[CEPS_MEM+st->memid-1] : st->cepstral_mem[st->memid-1];
1090  ceps_2 = (st->memid < 2) ? st->cepstral_mem[CEPS_MEM+st->memid-2] : st->cepstral_mem[st->memid-2];
1091 
1092  for (int i = 0; i < NB_BANDS; i++)
1093  ceps_0[i] = features[i];
1094 
1095  st->memid++;
1096  for (int i = 0; i < NB_DELTA_CEPS; i++) {
1097  features[i] = ceps_0[i] + ceps_1[i] + ceps_2[i];
1098  features[NB_BANDS+i] = ceps_0[i] - ceps_2[i];
1099  features[NB_BANDS+NB_DELTA_CEPS+i] = ceps_0[i] - 2*ceps_1[i] + ceps_2[i];
1100  }
1101  /* Spectral variability features. */
1102  if (st->memid == CEPS_MEM)
1103  st->memid = 0;
1104 
1105  for (int i = 0; i < CEPS_MEM; i++) {
1106  float mindist = 1e15f;
1107  for (int j = 0; j < CEPS_MEM; j++) {
1108  float dist = 0.f;
1109  for (int k = 0; k < NB_BANDS; k++) {
1110  float tmp;
1111 
1112  tmp = st->cepstral_mem[i][k] - st->cepstral_mem[j][k];
1113  dist += tmp*tmp;
1114  }
1115 
1116  if (j != i)
1117  mindist = FFMIN(mindist, dist);
1118  }
1119 
1120  spec_variability += mindist;
1121  }
1122 
1123  features[NB_BANDS+3*NB_DELTA_CEPS+1] = spec_variability/CEPS_MEM-2.1;
1124 
1125  return 0;
1126 }
1127 
1128 static void interp_band_gain(float *g, const float *bandE)
1129 {
1130  memset(g, 0, sizeof(*g) * FREQ_SIZE);
1131 
1132  for (int i = 0; i < NB_BANDS - 1; i++) {
1133  const int band_size = (eband5ms[i + 1] - eband5ms[i]) << FRAME_SIZE_SHIFT;
1134 
1135  for (int j = 0; j < band_size; j++) {
1136  float frac = (float)j / band_size;
1137 
1138  g[(eband5ms[i] << FRAME_SIZE_SHIFT) + j] = (1.f - frac) * bandE[i] + frac * bandE[i + 1];
1139  }
1140  }
1141 }
1142 
1143 static void pitch_filter(AVComplexFloat *X, const AVComplexFloat *P, const float *Ex, const float *Ep,
1144  const float *Exp, const float *g)
1145 {
1146  float newE[NB_BANDS];
1147  float r[NB_BANDS];
1148  float norm[NB_BANDS];
1149  float rf[FREQ_SIZE] = {0};
1150  float normf[FREQ_SIZE]={0};
1151 
1152  for (int i = 0; i < NB_BANDS; i++) {
1153  if (Exp[i]>g[i]) r[i] = 1;
1154  else r[i] = SQUARE(Exp[i])*(1-SQUARE(g[i]))/(.001 + SQUARE(g[i])*(1-SQUARE(Exp[i])));
1155  r[i] = sqrtf(av_clipf(r[i], 0, 1));
1156  r[i] *= sqrtf(Ex[i]/(1e-8+Ep[i]));
1157  }
1158  interp_band_gain(rf, r);
1159  for (int i = 0; i < FREQ_SIZE; i++) {
1160  X[i].re += rf[i]*P[i].re;
1161  X[i].im += rf[i]*P[i].im;
1162  }
1163  compute_band_energy(newE, X);
1164  for (int i = 0; i < NB_BANDS; i++) {
1165  norm[i] = sqrtf(Ex[i] / (1e-8+newE[i]));
1166  }
1167  interp_band_gain(normf, norm);
1168  for (int i = 0; i < FREQ_SIZE; i++) {
1169  X[i].re *= normf[i];
1170  X[i].im *= normf[i];
1171  }
1172 }
1173 
/*
 * Lookup table for tansig_approx(): 201 samples of tanh(x) taken every 0.04
 * from x = 0 to x = 8 (entry 25 is tanh(1) = 0.761594). tansig_approx()
 * selects the nearest entry with i = floor(0.5 + 25*x) and corrects for the
 * remaining distance.
 */
static const float tansig_table[201] = {
    0.000000f, 0.039979f, 0.079830f, 0.119427f, 0.158649f,
    0.197375f, 0.235496f, 0.272905f, 0.309507f, 0.345214f,
    0.379949f, 0.413644f, 0.446244f, 0.477700f, 0.507977f,
    0.537050f, 0.564900f, 0.591519f, 0.616909f, 0.641077f,
    0.664037f, 0.685809f, 0.706419f, 0.725897f, 0.744277f,
    0.761594f, 0.777888f, 0.793199f, 0.807569f, 0.821040f,
    0.833655f, 0.845456f, 0.856485f, 0.866784f, 0.876393f,
    0.885352f, 0.893698f, 0.901468f, 0.908698f, 0.915420f,
    0.921669f, 0.927473f, 0.932862f, 0.937863f, 0.942503f,
    0.946806f, 0.950795f, 0.954492f, 0.957917f, 0.961090f,
    0.964028f, 0.966747f, 0.969265f, 0.971594f, 0.973749f,
    0.975743f, 0.977587f, 0.979293f, 0.980869f, 0.982327f,
    0.983675f, 0.984921f, 0.986072f, 0.987136f, 0.988119f,
    0.989027f, 0.989867f, 0.990642f, 0.991359f, 0.992020f,
    0.992631f, 0.993196f, 0.993718f, 0.994199f, 0.994644f,
    0.995055f, 0.995434f, 0.995784f, 0.996108f, 0.996407f,
    0.996682f, 0.996937f, 0.997172f, 0.997389f, 0.997590f,
    0.997775f, 0.997946f, 0.998104f, 0.998249f, 0.998384f,
    0.998508f, 0.998623f, 0.998728f, 0.998826f, 0.998916f,
    0.999000f, 0.999076f, 0.999147f, 0.999213f, 0.999273f,
    0.999329f, 0.999381f, 0.999428f, 0.999472f, 0.999513f,
    0.999550f, 0.999585f, 0.999617f, 0.999646f, 0.999673f,
    0.999699f, 0.999722f, 0.999743f, 0.999763f, 0.999781f,
    0.999798f, 0.999813f, 0.999828f, 0.999841f, 0.999853f,
    0.999865f, 0.999875f, 0.999885f, 0.999893f, 0.999902f,
    0.999909f, 0.999916f, 0.999923f, 0.999929f, 0.999934f,
    0.999939f, 0.999944f, 0.999948f, 0.999952f, 0.999956f,
    0.999959f, 0.999962f, 0.999965f, 0.999968f, 0.999970f,
    0.999973f, 0.999975f, 0.999977f, 0.999978f, 0.999980f,
    0.999982f, 0.999983f, 0.999984f, 0.999986f, 0.999987f,
    0.999988f, 0.999989f, 0.999990f, 0.999990f, 0.999991f,
    0.999992f, 0.999992f, 0.999993f, 0.999994f, 0.999994f,
    0.999994f, 0.999995f, 0.999995f, 0.999996f, 0.999996f,
    0.999996f, 0.999997f, 0.999997f, 0.999997f, 0.999997f,
    0.999997f, 0.999998f, 0.999998f, 0.999998f, 0.999998f,
    0.999998f, 0.999998f, 0.999999f, 0.999999f, 0.999999f,
    0.999999f, 0.999999f, 0.999999f, 0.999999f, 0.999999f,
    0.999999f, 0.999999f, 0.999999f, 0.999999f, 0.999999f,
    1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
    1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
    1.000000f,
};
1217 
1218 static inline float tansig_approx(float x)
1219 {
1220  float y, dy;
1221  float sign=1;
1222  int i;
1223 
1224  /* Tests are reversed to catch NaNs */
1225  if (!(x<8))
1226  return 1;
1227  if (!(x>-8))
1228  return -1;
1229  /* Another check in case of -ffast-math */
1230 
1231  if (isnan(x))
1232  return 0;
1233 
1234  if (x < 0) {
1235  x=-x;
1236  sign=-1;
1237  }
1238  i = (int)floor(.5f+25*x);
1239  x -= .04f*i;
1240  y = tansig_table[i];
1241  dy = 1-y*y;
1242  y = y + x*dy*(1 - y*x);
1243  return sign*y;
1244 }
1245 
/**
 * Approximate the logistic sigmoid through the identity
 * sigmoid(x) = 0.5 + 0.5 * tanh(x / 2), using tansig_approx().
 */
static inline float sigmoid_approx(float x)
{
    const float half_x = .5f * x;

    return .5f + .5f * tansig_approx(half_x);
}
1250 
1251 static void compute_dense(const DenseLayer *layer, float *output, const float *input)
1252 {
1253  const int N = layer->nb_neurons, M = layer->nb_inputs, stride = N;
1254 
1255  for (int i = 0; i < N; i++) {
1256  /* Compute update gate. */
1257  float sum = layer->bias[i];
1258 
1259  for (int j = 0; j < M; j++)
1260  sum += layer->input_weights[j * stride + i] * input[j];
1261 
1262  output[i] = WEIGHTS_SCALE * sum;
1263  }
1264 
1265  if (layer->activation == ACTIVATION_SIGMOID) {
1266  for (int i = 0; i < N; i++)
1268  } else if (layer->activation == ACTIVATION_TANH) {
1269  for (int i = 0; i < N; i++)
1270  output[i] = tansig_approx(output[i]);
1271  } else if (layer->activation == ACTIVATION_RELU) {
1272  for (int i = 0; i < N; i++)
1273  output[i] = FFMAX(0, output[i]);
1274  } else {
1275  av_assert0(0);
1276  }
1277 }
1278 
1279 static void compute_gru(AudioRNNContext *s, const GRULayer *gru, float *state, const float *input)
1280 {
1281  LOCAL_ALIGNED_32(float, z, [MAX_NEURONS]);
1282  LOCAL_ALIGNED_32(float, r, [MAX_NEURONS]);
1283  LOCAL_ALIGNED_32(float, h, [MAX_NEURONS]);
1284  const int M = gru->nb_inputs;
1285  const int N = gru->nb_neurons;
1286  const int AN = FFALIGN(N, 4);
1287  const int AM = FFALIGN(M, 4);
1288  const int stride = 3 * AN, istride = 3 * AM;
1289 
1290  for (int i = 0; i < N; i++) {
1291  /* Compute update gate. */
1292  float sum = gru->bias[i];
1293 
1294  sum += s->fdsp->scalarproduct_float(gru->input_weights + i * istride, input, AM);
1295  sum += s->fdsp->scalarproduct_float(gru->recurrent_weights + i * stride, state, AN);
1296  z[i] = sigmoid_approx(WEIGHTS_SCALE * sum);
1297  }
1298 
1299  for (int i = 0; i < N; i++) {
1300  /* Compute reset gate. */
1301  float sum = gru->bias[N + i];
1302 
1303  sum += s->fdsp->scalarproduct_float(gru->input_weights + AM + i * istride, input, AM);
1304  sum += s->fdsp->scalarproduct_float(gru->recurrent_weights + AN + i * stride, state, AN);
1305  r[i] = sigmoid_approx(WEIGHTS_SCALE * sum);
1306  }
1307 
1308  for (int i = 0; i < N; i++) {
1309  /* Compute output. */
1310  float sum = gru->bias[2 * N + i];
1311 
1312  sum += s->fdsp->scalarproduct_float(gru->input_weights + 2 * AM + i * istride, input, AM);
1313  for (int j = 0; j < N; j++)
1314  sum += gru->recurrent_weights[2 * AN + i * stride + j] * state[j] * r[j];
1315 
1316  if (gru->activation == ACTIVATION_SIGMOID)
1317  sum = sigmoid_approx(WEIGHTS_SCALE * sum);
1318  else if (gru->activation == ACTIVATION_TANH)
1319  sum = tansig_approx(WEIGHTS_SCALE * sum);
1320  else if (gru->activation == ACTIVATION_RELU)
1321  sum = FFMAX(0, WEIGHTS_SCALE * sum);
1322  else
1323  av_assert0(0);
1324  h[i] = z[i] * state[i] + (1.f - z[i]) * sum;
1325  }
1326 
1327  RNN_COPY(state, h, N);
1328 }
1329 
1330 #define INPUT_SIZE 42
1331 
1332 static void compute_rnn(AudioRNNContext *s, RNNState *rnn, float *gains, float *vad, const float *input)
1333 {
1334  LOCAL_ALIGNED_32(float, dense_out, [MAX_NEURONS]);
1335  LOCAL_ALIGNED_32(float, noise_input, [MAX_NEURONS * 3]);
1336  LOCAL_ALIGNED_32(float, denoise_input, [MAX_NEURONS * 3]);
1337 
1338  compute_dense(rnn->model->input_dense, dense_out, input);
1339  compute_gru(s, rnn->model->vad_gru, rnn->vad_gru_state, dense_out);
1340  compute_dense(rnn->model->vad_output, vad, rnn->vad_gru_state);
1341 
1342  memcpy(noise_input, dense_out, rnn->model->input_dense_size * sizeof(float));
1343  memcpy(noise_input + rnn->model->input_dense_size,
1344  rnn->vad_gru_state, rnn->model->vad_gru_size * sizeof(float));
1345  memcpy(noise_input + rnn->model->input_dense_size + rnn->model->vad_gru_size,
1346  input, INPUT_SIZE * sizeof(float));
1347 
1348  compute_gru(s, rnn->model->noise_gru, rnn->noise_gru_state, noise_input);
1349 
1350  memcpy(denoise_input, rnn->vad_gru_state, rnn->model->vad_gru_size * sizeof(float));
1351  memcpy(denoise_input + rnn->model->vad_gru_size,
1352  rnn->noise_gru_state, rnn->model->noise_gru_size * sizeof(float));
1353  memcpy(denoise_input + rnn->model->vad_gru_size + rnn->model->noise_gru_size,
1354  input, INPUT_SIZE * sizeof(float));
1355 
1356  compute_gru(s, rnn->model->denoise_gru, rnn->denoise_gru_state, denoise_input);
1358 }
1359 
1360 static float rnnoise_channel(AudioRNNContext *s, DenoiseState *st, float *out, const float *in,
1361  int disabled)
1362 {
1365  float x[FRAME_SIZE];
1366  float Ex[NB_BANDS], Ep[NB_BANDS];
1367  LOCAL_ALIGNED_32(float, Exp, [NB_BANDS]);
1368  float features[NB_FEATURES];
1369  float g[NB_BANDS];
1370  float gf[FREQ_SIZE];
1371  float vad_prob = 0;
1372  float *history = st->history;
1373  static const float a_hp[2] = {-1.99599, 0.99600};
1374  static const float b_hp[2] = {-2, 1};
1375  int silence;
1376 
1377  biquad(x, st->mem_hp_x, in, b_hp, a_hp, FRAME_SIZE);
1378  silence = compute_frame_features(s, st, X, P, Ex, Ep, Exp, features, x);
1379 
1380  if (!silence && !disabled) {
1381  compute_rnn(s, &st->rnn[0], g, &vad_prob, features);
1382  pitch_filter(X, P, Ex, Ep, Exp, g);
1383  for (int i = 0; i < NB_BANDS; i++) {
1384  float alpha = .6f;
1385 
1386  g[i] = FFMAX(g[i], alpha * st->lastg[i]);
1387  st->lastg[i] = g[i];
1388  }
1389 
1390  interp_band_gain(gf, g);
1391 
1392  for (int i = 0; i < FREQ_SIZE; i++) {
1393  X[i].re *= gf[i];
1394  X[i].im *= gf[i];
1395  }
1396  }
1397 
1398  frame_synthesis(s, st, out, X);
1399  memcpy(history, in, FRAME_SIZE * sizeof(*history));
1400 
1401  return vad_prob;
1402 }
1403 
1404 typedef struct ThreadData {
1405  AVFrame *in, *out;
1406 } ThreadData;
1407 
1408 static int rnnoise_channels(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
1409 {
1410  AudioRNNContext *s = ctx->priv;
1411  ThreadData *td = arg;
1412  AVFrame *in = td->in;
1413  AVFrame *out = td->out;
1414  const int start = (out->ch_layout.nb_channels * jobnr) / nb_jobs;
1415  const int end = (out->ch_layout.nb_channels * (jobnr+1)) / nb_jobs;
1416 
1417  for (int ch = start; ch < end; ch++) {
1418  rnnoise_channel(s, &s->st[ch],
1419  (float *)out->extended_data[ch],
1420  (const float *)in->extended_data[ch],
1421  ctx->is_disabled);
1422  }
1423 
1424  return 0;
1425 }
1426 
1428 {
1429  AVFilterContext *ctx = inlink->dst;
1430  AVFilterLink *outlink = ctx->outputs[0];
1431  AVFrame *out = NULL;
1432  ThreadData td;
1433 
1434  out = ff_get_audio_buffer(outlink, FRAME_SIZE);
1435  if (!out) {
1436  av_frame_free(&in);
1437  return AVERROR(ENOMEM);
1438  }
1439  av_frame_copy_props(out, in);
1440 
1441  td.in = in; td.out = out;
1444 
1445  av_frame_free(&in);
1446  return ff_filter_frame(outlink, out);
1447 }
1448 
1450 {
1451  AVFilterLink *inlink = ctx->inputs[0];
1452  AVFilterLink *outlink = ctx->outputs[0];
1453  AVFrame *in = NULL;
1454  int ret;
1455 
1457 
1459  if (ret < 0)
1460  return ret;
1461 
1462  if (ret > 0)
1463  return filter_frame(inlink, in);
1464 
1465  FF_FILTER_FORWARD_STATUS(inlink, outlink);
1466  FF_FILTER_FORWARD_WANTED(outlink, inlink);
1467 
1468  return FFERROR_NOT_READY;
1469 }
1470 
1472 {
1473  AudioRNNContext *s = ctx->priv;
1474  int ret;
1475  FILE *f;
1476 
1477  if (!s->model_name)
1478  return AVERROR(EINVAL);
1479  f = avpriv_fopen_utf8(s->model_name, "r");
1480  if (!f) {
1481  av_log(ctx, AV_LOG_ERROR, "Failed to open model file: %s\n", s->model_name);
1482  return AVERROR(EINVAL);
1483  }
1484 
1485  ret = rnnoise_model_from_file(f, model);
1486  fclose(f);
1487  if (!*model || ret < 0)
1488  return ret;
1489 
1490  return 0;
1491 }
1492 
1494 {
1495  AudioRNNContext *s = ctx->priv;
1496  int ret;
1497 
1498  s->fdsp = avpriv_float_dsp_alloc(0);
1499  if (!s->fdsp)
1500  return AVERROR(ENOMEM);
1501 
1502  ret = open_model(ctx, &s->model[0]);
1503  if (ret < 0)
1504  return ret;
1505 
1506  for (int i = 0; i < FRAME_SIZE; i++) {
1507  s->window[i] = sin(.5*M_PI*sin(.5*M_PI*(i+.5)/FRAME_SIZE) * sin(.5*M_PI*(i+.5)/FRAME_SIZE));
1508  s->window[WINDOW_SIZE - 1 - i] = s->window[i];
1509  }
1510 
1511  for (int i = 0; i < NB_BANDS; i++) {
1512  for (int j = 0; j < NB_BANDS; j++) {
1513  s->dct_table[j][i] = cosf((i + .5f) * j * M_PI / NB_BANDS);
1514  if (j == 0)
1515  s->dct_table[j][i] *= sqrtf(.5);
1516  }
1517  }
1518 
1519  return 0;
1520 }
1521 
1522 static void free_model(AVFilterContext *ctx, int n)
1523 {
1524  AudioRNNContext *s = ctx->priv;
1525 
1526  rnnoise_model_free(s->model[n]);
1527  s->model[n] = NULL;
1528 
1529  for (int ch = 0; ch < s->channels && s->st; ch++) {
1530  av_freep(&s->st[ch].rnn[n].vad_gru_state);
1531  av_freep(&s->st[ch].rnn[n].noise_gru_state);
1532  av_freep(&s->st[ch].rnn[n].denoise_gru_state);
1533  }
1534 }
1535 
1536 static int process_command(AVFilterContext *ctx, const char *cmd, const char *args,
1537  char *res, int res_len, int flags)
1538 {
1539  AudioRNNContext *s = ctx->priv;
1540  int ret;
1541 
1542  ret = ff_filter_process_command(ctx, cmd, args, res, res_len, flags);
1543  if (ret < 0)
1544  return ret;
1545 
1546  ret = open_model(ctx, &s->model[1]);
1547  if (ret < 0)
1548  return ret;
1549 
1550  FFSWAP(RNNModel *, s->model[0], s->model[1]);
1551  for (int ch = 0; ch < s->channels; ch++)
1552  FFSWAP(RNNState, s->st[ch].rnn[0], s->st[ch].rnn[1]);
1553 
1554  ret = config_input(ctx->inputs[0]);
1555  if (ret < 0) {
1556  for (int ch = 0; ch < s->channels; ch++)
1557  FFSWAP(RNNState, s->st[ch].rnn[0], s->st[ch].rnn[1]);
1558  FFSWAP(RNNModel *, s->model[0], s->model[1]);
1559  return ret;
1560  }
1561 
1562  free_model(ctx, 1);
1563  return 0;
1564 }
1565 
1567 {
1568  AudioRNNContext *s = ctx->priv;
1569 
1570  av_freep(&s->fdsp);
1571  free_model(ctx, 0);
1572  for (int ch = 0; ch < s->channels && s->st; ch++) {
1573  av_tx_uninit(&s->st[ch].tx);
1574  av_tx_uninit(&s->st[ch].txi);
1575  }
1576  av_freep(&s->st);
1577 }
1578 
/* Single audio input pad; config_input is registered as its config_props callback. */
1579 static const AVFilterPad inputs[] = {
1580  {
1581  .name = "default",
1582  .type = AVMEDIA_TYPE_AUDIO,
1583  .config_props = config_input,
1584  },
1585 };
1586 
1587 #define OFFSET(x) offsetof(AudioRNNContext, x)
1588 #define AF AV_OPT_FLAG_AUDIO_PARAM|AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_RUNTIME_PARAM
1589 
1590 static const AVOption arnndn_options[] = {
1591  { "model", "set model name", OFFSET(model_name), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, AF },
1592  { "m", "set model name", OFFSET(model_name), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, AF },
1593  { "mix", "set output vs input mix", OFFSET(mix), AV_OPT_TYPE_FLOAT, {.dbl=1.0},-1, 1, AF },
1594  { NULL }
1595 };
1596 
1597 AVFILTER_DEFINE_CLASS(arnndn);
1598 
1600  .name = "arnndn",
1601  .description = NULL_IF_CONFIG_SMALL("Reduce noise from speech using Recurrent Neural Networks."),
1602  .priv_size = sizeof(AudioRNNContext),
1603  .priv_class = &arnndn_class,
1604  .activate = activate,
1605  .init = init,
1606  .uninit = uninit,
1612  .process_command = process_command,
1613 };
error
static void error(const char *err)
Definition: target_bsf_fuzzer.c:32
M
#define M(a, b)
Definition: vp3dsp.c:48
compute_dense
static void compute_dense(const DenseLayer *layer, float *output, const float *input)
Definition: af_arnndn.c:1251
ff_get_audio_buffer
AVFrame * ff_get_audio_buffer(AVFilterLink *link, int nb_samples)
Request an audio samples buffer with a specific set of permissions.
Definition: audio.c:98
AV_SAMPLE_FMT_FLTP
@ AV_SAMPLE_FMT_FLTP
float, planar
Definition: samplefmt.h:66
PITCH_MAX_PERIOD
#define PITCH_MAX_PERIOD
Definition: af_arnndn.c:52
pitch_downsample
static void pitch_downsample(float *x[], float *x_lp, int len, int C)
Definition: af_arnndn.c:741
WEIGHTS_SCALE
#define WEIGHTS_SCALE
Definition: af_arnndn.c:65
mix
static int mix(int c0, int c1)
Definition: 4xm.c:716
DenoiseState::synthesis_mem
float synthesis_mem[FRAME_SIZE]
Definition: af_arnndn.c:123
r
const char * r
Definition: vf_curves.c:127
AVERROR
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
opt.h
activate
static int activate(AVFilterContext *ctx)
Definition: af_arnndn.c:1449
mem_internal.h
GRULayer::activation
int activation
Definition: af_arnndn.c:89
out
FILE * out
Definition: movenc.c:55
dual_inner_prod
static void dual_inner_prod(const float *x, const float *y01, const float *y02, int N, float *xy1, float *xy2)
Definition: af_arnndn.c:783
ff_filter_frame
int ff_filter_frame(AVFilterLink *link, AVFrame *frame)
Send a frame of data to the next filter.
Definition: avfilter.c:1062
sample_fmts
static enum AVSampleFormat sample_fmts[]
Definition: adpcmenc.c:948
FFERROR_NOT_READY
return FFERROR_NOT_READY
Definition: filter_design.txt:204
FREE_GRU
#define FREE_GRU(name)
AVTXContext
Definition: tx_priv.h:235
output
filter_frame For filters that do not use the this method is called when a frame is pushed to the filter s input It can be called at any time except in a reentrant way If the input frame is enough to produce output
Definition: filter_design.txt:225
inlink
The exact code depends on how similar the blocks are and how related they are to the and needs to apply these operations to the correct inlink or outlink if there are several Macros are available to factor that when no extra processing is inlink
Definition: filter_design.txt:212
PITCH_MIN_PERIOD
#define PITCH_MIN_PERIOD
Definition: af_arnndn.c:51
av_frame_free
void av_frame_free(AVFrame **frame)
Free the frame and any dynamically allocated objects in it, e.g.
Definition: frame.c:162
GRULayer::nb_neurons
int nb_neurons
Definition: af_arnndn.c:88
RNNState::noise_gru_state
float * noise_gru_state
Definition: af_arnndn.c:114
uninit
static av_cold void uninit(AVFilterContext *ctx)
Definition: af_arnndn.c:1566
FILTER_INPUTS
#define FILTER_INPUTS(array)
Definition: filters.h:262
inverse_transform
static void inverse_transform(DenoiseState *st, float *out, const AVComplexFloat *in)
Definition: af_arnndn.c:424
sample_rates
static const int sample_rates[]
Definition: dcaenc.h:34
AVFrame
This structure describes decoded (raw) audio or video data.
Definition: frame.h:389
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:28
DenoiseState::lastg
float lastg[NB_BANDS]
Definition: af_arnndn.c:129
AVOption
AVOption.
Definition: opt.h:429
OFFSET
#define OFFSET(x)
Definition: af_arnndn.c:1587
b
#define b
Definition: input.c:41
arnndn_options
static const AVOption arnndn_options[]
Definition: af_arnndn.c:1590
frame_synthesis
static void frame_synthesis(AudioRNNContext *s, DenoiseState *st, float *out, const AVComplexFloat *y)
Definition: af_arnndn.c:509
NB_DELTA_CEPS
#define NB_DELTA_CEPS
Definition: af_arnndn.c:61
RNNModel::input_dense_size
int input_dense_size
Definition: af_arnndn.c:93
AVComplexFloat
Definition: tx.h:27
FFMAX
#define FFMAX(a, b)
Definition: macros.h:47
AVFilter::name
const char * name
Filter name.
Definition: avfilter.h:205
c1
static const uint64_t c1
Definition: murmur3.c:52
ThreadData::out
AVFrame * out
Definition: af_adeclick.c:526
AVChannelLayout::nb_channels
int nb_channels
Number of channels in this layout.
Definition: channel_layout.h:327
ThreadData::in
AVFrame * in
Definition: af_adecorrelate.c:155
tansig_table
static const float tansig_table[201]
Definition: af_arnndn.c:1174
find_best_pitch
static void find_best_pitch(float *xcorr, float *y, int len, int max_pitch, int *best_pitch)
Definition: af_arnndn.c:906
FF_FILTER_FORWARD_STATUS_BACK
#define FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink)
Forward the status on an output link to an input link.
Definition: filters.h:434
process_command
static int process_command(AVFilterContext *ctx, const char *cmd, const char *args, char *res, int res_len, int flags)
Definition: af_arnndn.c:1536
av_tx_init
av_cold int av_tx_init(AVTXContext **ctx, av_tx_fn *tx, enum AVTXType type, int inv, int len, const void *scale, uint64_t flags)
Initialize a transform context with the given configuration (i)MDCTs with an odd length are currently...
Definition: tx.c:903
DenoiseState::memid
int memid
Definition: af_arnndn.c:122
RNN_CLEAR
#define RNN_CLEAR(dst, n)
Definition: af_arnndn.c:406
GRULayer::nb_inputs
int nb_inputs
Definition: af_arnndn.c:87
compute_band_energy
static void compute_band_energy(float *bandE, const AVComplexFloat *X)
Definition: af_arnndn.c:447
state
static struct @464 state
formats.h
compute_rnn
static void compute_rnn(AudioRNNContext *s, RNNState *rnn, float *gains, float *vad, const float *input)
Definition: af_arnndn.c:1332
DenoiseState::txi
AVTXContext * txi
Definition: af_arnndn.c:132
free_model
static void free_model(AVFilterContext *ctx, int n)
Definition: af_arnndn.c:1522
RNNState::denoise_gru_state
float * denoise_gru_state
Definition: af_arnndn.c:115
rnnoise_channels
static int rnnoise_channels(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
Definition: af_arnndn.c:1408
ACTIVATION_RELU
#define ACTIVATION_RELU
Definition: af_arnndn.c:71
AVComplexFloat::im
float im
Definition: tx.h:28
DenoiseState::mem_hp_x
float mem_hp_x[2]
Definition: af_arnndn.c:128
window
static SDL_Window * window
Definition: ffplay.c:361
cosf
#define cosf(x)
Definition: libm.h:78
log10f
#define log10f(x)
Definition: libm.h:414
AudioRNNContext::model
RNNModel * model[2]
Definition: af_arnndn.c:148
rnnoise_model_free
static void rnnoise_model_free(RNNModel *model)
Definition: af_arnndn.c:157
AudioRNNContext::st
DenoiseState * st
Definition: af_arnndn.c:143
DenoiseState::cepstral_mem
float cepstral_mem[CEPS_MEM][NB_BANDS]
Definition: af_arnndn.c:121
SQUARE
#define SQUARE(x)
Definition: af_arnndn.c:56
AF
#define AF
Definition: af_arnndn.c:1588
DenseLayer::bias
const float * bias
Definition: af_arnndn.c:76
AVFilterPad
A filter pad used for either input or output.
Definition: filters.h:38
FREQ_SIZE
#define FREQ_SIZE
Definition: af_arnndn.c:49
T
#define T(x)
Definition: vpx_arith.h:29
compute_band_corr
static void compute_band_corr(float *bandE, const AVComplexFloat *X, const AVComplexFloat *P)
Definition: af_arnndn.c:472
DenoiseState::history
float history[FRAME_SIZE]
Definition: af_arnndn.c:130
C
s EdgeDetect Foobar g libavfilter vf_edgedetect c libavfilter vf_foobar c edit libavfilter and add an entry for foobar following the pattern of the other filters edit libavfilter allfilters and add an entry for foobar following the pattern of the other filters configure make j< whatever > ffmpeg ffmpeg i you should get a foobar png with Lena edge detected That s your new playground is ready Some little details about what s going which in turn will define variables for the build system and the C
Definition: writing_filters.txt:58
avassert.h
AV_LOG_ERROR
#define AV_LOG_ERROR
Something went wrong and cannot losslessly be recovered.
Definition: log.h:209
av_cold
#define av_cold
Definition: attributes.h:90
av_tx_fn
void(* av_tx_fn)(AVTXContext *s, void *out, void *in, ptrdiff_t stride)
Function pointer to a function to perform the transform.
Definition: tx.h:151
float
float
Definition: af_crystalizer.c:122
MAX_NEURONS
#define MAX_NEURONS
Definition: af_arnndn.c:67
s
#define s(width, name)
Definition: cbs_vp9.c:198
frame_analysis
static void frame_analysis(AudioRNNContext *s, DenoiseState *st, AVComplexFloat *X, float *Ex, const float *in)
Definition: af_arnndn.c:497
DenseLayer::nb_inputs
int nb_inputs
Definition: af_arnndn.c:78
CEPS_MEM
#define CEPS_MEM
Definition: af_arnndn.c:60
floor
static __device__ float floor(float a)
Definition: cuda_runtime.h:173
inputs
static const AVFilterPad inputs[]
Definition: af_arnndn.c:1579
g
const char * g
Definition: vf_curves.c:128
celt_inner_prod
static float celt_inner_prod(const float *x, const float *y, int N)
Definition: af_arnndn.c:595
AVMEDIA_TYPE_AUDIO
@ AVMEDIA_TYPE_AUDIO
Definition: avutil.h:202
av_assert0
#define av_assert0(cond)
assert() equivalent, that is always enabled.
Definition: avassert.h:40
filters.h
AV_TX_FLOAT_FFT
@ AV_TX_FLOAT_FFT
Standard complex to complex FFT with sample data type of AVComplexFloat, AVComplexDouble or AVComplex...
Definition: tx.h:47
ff_set_common_samplerates_from_list2
int ff_set_common_samplerates_from_list2(const AVFilterContext *ctx, AVFilterFormatsConfig **cfg_in, AVFilterFormatsConfig **cfg_out, const int *samplerates)
Definition: formats.c:944
ctx
AVFormatContext * ctx
Definition: movenc.c:49
RNNModel::vad_gru_size
int vad_gru_size
Definition: af_arnndn.c:96
xi
#define xi(width, name, var, range_min, range_max, subs,...)
Definition: cbs_h2645.c:418
rnnoise_model_from_file
static int rnnoise_model_from_file(FILE *f, RNNModel **rnn)
Definition: af_arnndn.c:187
ff_af_arnndn
const AVFilter ff_af_arnndn
Definition: af_arnndn.c:1599
config_input
static int config_input(AVFilterLink *inlink)
Definition: af_arnndn.c:347
FRAME_SIZE_SHIFT
#define FRAME_SIZE_SHIFT
Definition: af_arnndn.c:46
ACTIVATION_TANH
#define ACTIVATION_TANH
Definition: af_arnndn.c:69
FILTER_OUTPUTS
#define FILTER_OUTPUTS(array)
Definition: filters.h:263
file_open.h
E
#define E
Definition: avdct.c:33
arg
const char * arg
Definition: jacosubdec.c:67
FFABS
#define FFABS(a)
Absolute value, Note, INT_MIN / INT64_MIN result in undefined behavior as they are not representable ...
Definition: common.h:74
if
if(ret)
Definition: filter_design.txt:179
RNNModel::vad_gru
const GRULayer * vad_gru
Definition: af_arnndn.c:97
AVClass
Describe the class of an AVClass context structure.
Definition: log.h:75
ff_inlink_consume_samples
int ff_inlink_consume_samples(AVFilterLink *link, unsigned min, unsigned max, AVFrame **rframe)
Take samples from the link's FIFO and update the link's stats.
Definition: avfilter.c:1511
NULL
#define NULL
Definition: coverity.c:32
LOCAL_ALIGNED_32
#define LOCAL_ALIGNED_32(t, v,...)
Definition: mem_internal.h:156
av_frame_copy_props
int av_frame_copy_props(AVFrame *dst, const AVFrame *src)
Copy only "metadata" fields from src to dst.
Definition: frame.c:713
sigmoid_approx
static float sigmoid_approx(float x)
Definition: af_arnndn.c:1246
RNNModel::denoise_gru_size
int denoise_gru_size
Definition: af_arnndn.c:102
RNNModel::vad_output
const DenseLayer * vad_output
Definition: af_arnndn.c:109
isnan
#define isnan(x)
Definition: libm.h:340
GRULayer::recurrent_weights
const float * recurrent_weights
Definition: af_arnndn.c:86
FREE_DENSE
#define FREE_DENSE(name)
PITCH_BUF_SIZE
#define PITCH_BUF_SIZE
Definition: af_arnndn.c:54
ff_audio_default_filterpad
const AVFilterPad ff_audio_default_filterpad[1]
An AVFilterPad array whose only entry has name "default" and is of type AVMEDIA_TYPE_AUDIO.
Definition: audio.c:34
sqrtf
static __device__ float sqrtf(float a)
Definition: cuda_runtime.h:184
PITCH_FRAME_SIZE
#define PITCH_FRAME_SIZE
Definition: af_arnndn.c:53
av_clipf
av_clipf
Definition: af_crystalizer.c:122
RNNModel::input_dense
const DenseLayer * input_dense
Definition: af_arnndn.c:94
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
AVFilterFormatsConfig
Lists of formats / etc.
Definition: avfilter.h:111
DenseLayer::input_weights
const float * input_weights
Definition: af_arnndn.c:77
float_dsp.h
biquad
static void biquad(float *y, float mem[2], const float *x, const float *b, const float *a, int N)
Definition: af_arnndn.c:391
DenoiseState::pitch_buf
float pitch_buf[PITCH_BUF_SIZE]
Definition: af_arnndn.c:124
f
f
Definition: af_crystalizer.c:122
INPUT_SIZE
#define INPUT_SIZE
Definition: af_arnndn.c:1330
NULL_IF_CONFIG_SMALL
#define NULL_IF_CONFIG_SMALL(x)
Return NULL if CONFIG_SMALL is true, otherwise the argument without modification.
Definition: internal.h:94
NB_BANDS
#define NB_BANDS
Definition: af_arnndn.c:58
DECLARE_ALIGNED
#define DECLARE_ALIGNED(n, t, v)
Definition: mem_internal.h:109
P
#define P
shift
static int shift(int a, int b)
Definition: bonk.c:261
DenseLayer::nb_neurons
int nb_neurons
Definition: af_arnndn.c:79
AV_SAMPLE_FMT_NONE
@ AV_SAMPLE_FMT_NONE
Definition: samplefmt.h:56
celt_autocorr
static int celt_autocorr(const float *x, float *ac, const float *window, int overlap, int lag, int n)
Definition: af_arnndn.c:627
WINDOW_SIZE
#define WINDOW_SIZE
Definition: af_arnndn.c:48
AVComplexFloat::re
float re
Definition: tx.h:28
AudioRNNContext::mix
float mix
Definition: af_arnndn.c:140
AVFloatDSPContext
Definition: float_dsp.h:24
RNNModel::noise_gru_size
int noise_gru_size
Definition: af_arnndn.c:99
celt_lpc
static void celt_lpc(float *lpc, const float *ac, int p)
Definition: af_arnndn.c:665
ff_filter_process_command
int ff_filter_process_command(AVFilterContext *ctx, const char *cmd, const char *arg, char *res, int res_len, int flags)
Generic processing of user supplied commands that are set in the same way as the filter options.
Definition: avfilter.c:901
DenoiseState::rnn
RNNState rnn[2]
Definition: af_arnndn.c:131
a
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:41
RNN_MOVE
#define RNN_MOVE(dst, src, n)
Definition: af_arnndn.c:405
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
FF_FILTER_FORWARD_WANTED
FF_FILTER_FORWARD_WANTED(outlink, inlink)
N
#define N
Definition: af_mcompand.c:54
RNNModel::denoise_gru
const GRULayer * denoise_gru
Definition: af_arnndn.c:103
input
and forward the test the status of outputs and forward it to the corresponding return FFERROR_NOT_READY If the filters stores internally one or a few frame for some input
Definition: filter_design.txt:172
DenoiseState::last_gain
float last_gain
Definition: af_arnndn.c:126
M_PI
#define M_PI
Definition: mathematics.h:67
av_tx_uninit
av_cold void av_tx_uninit(AVTXContext **ctx)
Frees a context and sets *ctx to NULL, does nothing when *ctx == NULL.
Definition: tx.c:295
AudioRNNContext::channels
int channels
Definition: af_arnndn.c:142
DenoiseState::tx
AVTXContext * tx
Definition: af_arnndn.c:132
ACTIVATION_SIGMOID
#define ACTIVATION_SIGMOID
Definition: af_arnndn.c:70
AudioRNNContext::model_name
char * model_name
Definition: af_arnndn.c:139
AV_OPT_TYPE_FLOAT
@ AV_OPT_TYPE_FLOAT
Underlying C type is float.
Definition: opt.h:271
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:256
DenoiseState
Definition: af_arnndn.c:119
RNN_COPY
#define RNN_COPY(dst, src, n)
Definition: af_arnndn.c:407
AVFrame::extended_data
uint8_t ** extended_data
pointers to the data planes/channels.
Definition: frame.h:450
ff_filter_get_nb_threads
int ff_filter_get_nb_threads(AVFilterContext *ctx)
Get number of threads for current filter instance.
Definition: avfilter.c:841
AVSampleFormat
AVSampleFormat
Audio sample formats.
Definition: samplefmt.h:55
ThreadData
Used for passing data between threads.
Definition: dsddec.c:71
interp_band_gain
static void interp_band_gain(float *g, const float *bandE)
Definition: af_arnndn.c:1128
FILTER_QUERY_FUNC2
#define FILTER_QUERY_FUNC2(func)
Definition: filters.h:239
FFMIN
#define FFMIN(a, b)
Definition: macros.h:49
dct
static void dct(AudioRNNContext *s, float *out, const float *in)
Definition: af_arnndn.c:1010
AudioRNNContext
Definition: af_arnndn.c:136
FRAME_SIZE
#define FRAME_SIZE
Definition: af_arnndn.c:47
len
int len
Definition: vorbis_enc_data.h:426
AudioRNNContext::dct_table
float dct_table[FFALIGN(NB_BANDS, 4)][FFALIGN(NB_BANDS, 4)]
Definition: af_arnndn.c:146
AVFilterPad::name
const char * name
Pad name.
Definition: filters.h:44
avpriv_fopen_utf8
FILE * avpriv_fopen_utf8(const char *path, const char *mode)
Open a file using a UTF-8 filename.
Definition: file_open.c:161
av_calloc
void * av_calloc(size_t nmemb, size_t size)
Definition: mem.c:264
stride
#define stride
Definition: h264pred_template.c:537
AVFilter
Filter definition.
Definition: avfilter.h:201
open_model
static int open_model(AVFilterContext *ctx, RNNModel **model)
Definition: af_arnndn.c:1471
ret
ret
Definition: filter_design.txt:187
RNNModel
Definition: af_arnndn.c:92
FFSWAP
#define FFSWAP(type, a, b)
Definition: macros.h:52
compute_frame_features
static int compute_frame_features(AudioRNNContext *s, DenoiseState *st, AVComplexFloat *X, AVComplexFloat *P, float *Ex, float *Ep, float *Exp, float *features, const float *in)
Definition: af_arnndn.c:1020
DenseLayer
Definition: af_arnndn.c:75
GRULayer::input_weights
const float * input_weights
Definition: af_arnndn.c:85
query_formats
static int query_formats(const AVFilterContext *ctx, AVFilterFormatsConfig **cfg_in, AVFilterFormatsConfig **cfg_out)
Definition: af_arnndn.c:330
AudioRNNContext::window
float window[WINDOW_SIZE]
Definition: af_arnndn.c:145
second_check
static const uint8_t second_check[16]
Definition: af_arnndn.c:802
remove_doubling
static float remove_doubling(float *x, int maxperiod, int minperiod, int N, int *T0_, int prev_period, float prev_gain)
Definition: af_arnndn.c:803
RNNModel::denoise_output_size
int denoise_output_size
Definition: af_arnndn.c:105
ff_set_common_formats_from_list2
int ff_set_common_formats_from_list2(const AVFilterContext *ctx, AVFilterFormatsConfig **cfg_in, AVFilterFormatsConfig **cfg_out, const int *fmts)
Definition: formats.c:1016
compute_pitch_gain
static float compute_pitch_gain(float xy, float xx, float yy)
Definition: af_arnndn.c:797
AVFILTER_DEFINE_CLASS
AVFILTER_DEFINE_CLASS(arnndn)
ff_filter_execute
int ff_filter_execute(AVFilterContext *ctx, avfilter_action_func *func, void *arg, int *ret, int nb_jobs)
Definition: avfilter.c:1667
xcorr_kernel
static void xcorr_kernel(const float *x, const float *y, float sum[4], int len)
Definition: af_arnndn.c:526
RNNModel::vad_output_size
int vad_output_size
Definition: af_arnndn.c:108
pitch_search
static void pitch_search(const float *x_lp, float *y, int len, int max_pitch, int *pitch)
Definition: af_arnndn.c:953
pitch_filter
static void pitch_filter(AVComplexFloat *X, const AVComplexFloat *P, const float *Ex, const float *Ep, const float *Exp, const float *g)
Definition: af_arnndn.c:1143
avfilter.h
celt_pitch_xcorr
static void celt_pitch_xcorr(const float *x, const float *y, float *xcorr, int len, int max_pitch)
Definition: af_arnndn.c:606
RNNState::vad_gru_state
float * vad_gru_state
Definition: af_arnndn.c:113
INPUT_GRU
#define INPUT_GRU(name)
rnnoise_channel
static float rnnoise_channel(AudioRNNContext *s, DenoiseState *st, float *out, const float *in, int disabled)
Definition: af_arnndn.c:1360
celt_fir5
static void celt_fir5(const float *x, const float *num, float *y, int N, float *mem)
Definition: af_arnndn.c:698
filter_frame
static int filter_frame(AVFilterLink *inlink, AVFrame *in)
Definition: af_arnndn.c:1427
AVFilterContext
An instance of a filter.
Definition: avfilter.h:457
DenoiseState::pitch_enh_buf
float pitch_enh_buf[PITCH_BUF_SIZE]
Definition: af_arnndn.c:125
X
@ X
Definition: vf_addroi.c:27
AVFILTER_FLAG_SLICE_THREADS
#define AVFILTER_FLAG_SLICE_THREADS
The filter supports multithreading by splitting frames into multiple parts and processing them concurrently.
Definition: avfilter.h:152
tansig_approx
static float tansig_approx(float x)
Definition: af_arnndn.c:1218
AudioRNNContext::fdsp
AVFloatDSPContext * fdsp
Definition: af_arnndn.c:150
Q15ONE
#define Q15ONE
Definition: af_arnndn.c:73
DenoiseState::last_period
int last_period
Definition: af_arnndn.c:127
mem.h
audio.h
DenoiseState::tx_fn
av_tx_fn tx_fn
Definition: af_arnndn.c:133
forward_transform
static void forward_transform(DenoiseState *st, AVComplexFloat *out, const float *in)
Definition: af_arnndn.c:409
av_free
#define av_free(p)
Definition: tableprint_vlc.h:33
scale
static void scale(int *out, const int *in, const int w, const int h, const int shift)
Definition: intra.c:291
FF_FILTER_FORWARD_STATUS
FF_FILTER_FORWARD_STATUS(inlink, outlink)
FFALIGN
#define FFALIGN(x, a)
Definition: macros.h:78
alpha
static const int16_t alpha[]
Definition: ilbcdata.h:55
av_freep
#define av_freep(p)
Definition: tableprint_vlc.h:34
avpriv_float_dsp_alloc
av_cold AVFloatDSPContext * avpriv_float_dsp_alloc(int bit_exact)
Allocate a float DSP context.
Definition: float_dsp.c:146
DenoiseState::txi_fn
av_tx_fn txi_fn
Definition: af_arnndn.c:133
AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL
#define AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL
Same as AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC, except that the filter will have its filter_frame() callback(s) called as usual even when the enable expression is false.
Definition: avfilter.h:190
flags
#define flags(name, subs,...)
Definition: cbs_av1.c:482
DenseLayer::activation
int activation
Definition: af_arnndn.c:80
RNNModel::denoise_output
const DenseLayer * denoise_output
Definition: af_arnndn.c:106
av_log
#define av_log(a,...)
Definition: tableprint_vlc.h:27
AVERROR_INVALIDDATA
#define AVERROR_INVALIDDATA
Invalid data found when processing input.
Definition: error.h:61
h
h
Definition: vp9dsp_template.c:2070
RNNState
Definition: af_arnndn.c:112
ALLOC_LAYER
#define ALLOC_LAYER(type, name)
AV_OPT_TYPE_STRING
@ AV_OPT_TYPE_STRING
Underlying C type is a uint8_t* that is either NULL or points to a C string allocated with the av_malloc() family of functions.
Definition: opt.h:276
GRULayer
Definition: af_arnndn.c:83
compute_gru
static void compute_gru(AudioRNNContext *s, const GRULayer *gru, float *state, const float *input)
Definition: af_arnndn.c:1279
eband5ms
static const uint8_t eband5ms[]
Definition: af_arnndn.c:442
GRULayer::bias
const float * bias
Definition: af_arnndn.c:84
INPUT_DENSE
#define INPUT_DENSE(name)
RNNModel::noise_gru
const GRULayer * noise_gru
Definition: af_arnndn.c:100
NB_FEATURES
#define NB_FEATURES
Definition: af_arnndn.c:63
src
#define src
Definition: vp8dsp.c:248
init
static av_cold int init(AVFilterContext *ctx)
Definition: af_arnndn.c:1493
tx.h
RNNState::model
RNNModel * model
Definition: af_arnndn.c:116
DenoiseState::analysis_mem
float analysis_mem[FRAME_SIZE]
Definition: af_arnndn.c:120