FFmpeg
af_arnndn.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2018 Gregor Richards
3  * Copyright (c) 2017 Mozilla
4  * Copyright (c) 2005-2009 Xiph.Org Foundation
5  * Copyright (c) 2007-2008 CSIRO
6  * Copyright (c) 2008-2011 Octasic Inc.
7  * Copyright (c) Jean-Marc Valin
8  * Copyright (c) 2019 Paul B Mahol
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  *
14  * - Redistributions of source code must retain the above copyright
15  * notice, this list of conditions and the following disclaimer.
16  *
17  * - Redistributions in binary form must reproduce the above copyright
18  * notice, this list of conditions and the following disclaimer in the
19  * documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
25  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "libavutil/avassert.h"
35 #include "libavutil/file_open.h"
36 #include "libavutil/float_dsp.h"
37 #include "libavutil/mem_internal.h"
38 #include "libavutil/opt.h"
39 #include "libavutil/tx.h"
40 #include "avfilter.h"
41 #include "audio.h"
42 #include "filters.h"
43 #include "formats.h"
44 
45 #define FRAME_SIZE_SHIFT 2
46 #define FRAME_SIZE (120<<FRAME_SIZE_SHIFT)
47 #define WINDOW_SIZE (2*FRAME_SIZE)
48 #define FREQ_SIZE (FRAME_SIZE + 1)
49 
50 #define PITCH_MIN_PERIOD 60
51 #define PITCH_MAX_PERIOD 768
52 #define PITCH_FRAME_SIZE 960
53 #define PITCH_BUF_SIZE (PITCH_MAX_PERIOD+PITCH_FRAME_SIZE)
54 
55 #define SQUARE(x) ((x)*(x))
56 
57 #define NB_BANDS 22
58 
59 #define CEPS_MEM 8
60 #define NB_DELTA_CEPS 6
61 
62 #define NB_FEATURES (NB_BANDS+3*NB_DELTA_CEPS+2)
63 
64 #define WEIGHTS_SCALE (1.f/256)
65 
66 #define MAX_NEURONS 128
67 
68 #define ACTIVATION_TANH 0
69 #define ACTIVATION_SIGMOID 1
70 #define ACTIVATION_RELU 2
71 
72 #define Q15ONE 1.0f
73 
74 typedef struct DenseLayer {
75  const float *bias;
76  const float *input_weights;
77  int nb_inputs;
80 } DenseLayer;
81 
82 typedef struct GRULayer {
83  const float *bias;
84  const float *input_weights;
85  const float *recurrent_weights;
86  int nb_inputs;
89 } GRULayer;
90 
91 typedef struct RNNModel {
94 
96  const GRULayer *vad_gru;
97 
100 
103 
106 
109 } RNNModel;
110 
111 typedef struct RNNState {
116 } RNNState;
117 
118 typedef struct DenoiseState {
121  int memid;
125  float last_gain;
127  float mem_hp_x[2];
128  float lastg[NB_BANDS];
133 } DenoiseState;
134 
135 typedef struct AudioRNNContext {
136  const AVClass *class;
137 
138  char *model_name;
139  float mix;
140 
141  int channels;
143 
146 
148 
151 
152 #define F_ACTIVATION_TANH 0
153 #define F_ACTIVATION_SIGMOID 1
154 #define F_ACTIVATION_RELU 2
155 
/**
 * Free a RNNModel and all the per-layer weight/bias arrays it owns.
 * Safe to call with NULL. Counterpart of rnnoise_model_from_file().
 */
static void rnnoise_model_free(RNNModel *model)
{
/* NOTE(review): FREE_MAYBE is unused in this function and calls plain
 * free(), while every release below uses av_free() (matching the
 * av_calloc() allocations in rnnoise_model_from_file()). Presumably a
 * leftover from the original rnnoise code — confirm nothing later in the
 * file uses it before removing. */
#define FREE_MAYBE(ptr) do { if (ptr) free(ptr); } while (0)
/* Release one DenseLayer: its weight and bias arrays, then the struct. */
#define FREE_DENSE(name) do { \
    if (model->name) { \
        av_free((void *) model->name->input_weights); \
        av_free((void *) model->name->bias); \
        av_free((void *) model->name); \
    } \
    } while (0)
/* Release one GRULayer: input + recurrent weights, bias, then the struct. */
#define FREE_GRU(name) do { \
    if (model->name) { \
        av_free((void *) model->name->input_weights); \
        av_free((void *) model->name->recurrent_weights); \
        av_free((void *) model->name->bias); \
        av_free((void *) model->name); \
    } \
    } while (0)

    if (!model)
        return;
    FREE_DENSE(input_dense);
    FREE_GRU(vad_gru);
    FREE_GRU(noise_gru);
    FREE_GRU(denoise_gru);
    FREE_DENSE(denoise_output);
    FREE_DENSE(vad_output);
    av_free(model);
}
185 
186 static int rnnoise_model_from_file(FILE *f, RNNModel **rnn)
187 {
188  RNNModel *ret = NULL;
189  DenseLayer *input_dense;
190  GRULayer *vad_gru;
191  GRULayer *noise_gru;
192  GRULayer *denoise_gru;
193  DenseLayer *denoise_output;
194  DenseLayer *vad_output;
195  int in;
196 
197  if (fscanf(f, "rnnoise-nu model file version %d\n", &in) != 1 || in != 1)
198  return AVERROR_INVALIDDATA;
199 
200  ret = av_calloc(1, sizeof(RNNModel));
201  if (!ret)
202  return AVERROR(ENOMEM);
203 
204 #define ALLOC_LAYER(type, name) \
205  name = av_calloc(1, sizeof(type)); \
206  if (!name) { \
207  rnnoise_model_free(ret); \
208  return AVERROR(ENOMEM); \
209  } \
210  ret->name = name
211 
212  ALLOC_LAYER(DenseLayer, input_dense);
213  ALLOC_LAYER(GRULayer, vad_gru);
214  ALLOC_LAYER(GRULayer, noise_gru);
215  ALLOC_LAYER(GRULayer, denoise_gru);
216  ALLOC_LAYER(DenseLayer, denoise_output);
217  ALLOC_LAYER(DenseLayer, vad_output);
218 
219 #define INPUT_VAL(name) do { \
220  if (fscanf(f, "%d", &in) != 1 || in < 0 || in > 128) { \
221  rnnoise_model_free(ret); \
222  return AVERROR(EINVAL); \
223  } \
224  name = in; \
225  } while (0)
226 
227 #define INPUT_ACTIVATION(name) do { \
228  int activation; \
229  INPUT_VAL(activation); \
230  switch (activation) { \
231  case F_ACTIVATION_SIGMOID: \
232  name = ACTIVATION_SIGMOID; \
233  break; \
234  case F_ACTIVATION_RELU: \
235  name = ACTIVATION_RELU; \
236  break; \
237  default: \
238  name = ACTIVATION_TANH; \
239  } \
240  } while (0)
241 
242 #define INPUT_ARRAY(name, len) do { \
243  float *values = av_calloc((len), sizeof(float)); \
244  if (!values) { \
245  rnnoise_model_free(ret); \
246  return AVERROR(ENOMEM); \
247  } \
248  name = values; \
249  for (int i = 0; i < (len); i++) { \
250  if (fscanf(f, "%d", &in) != 1) { \
251  rnnoise_model_free(ret); \
252  return AVERROR(EINVAL); \
253  } \
254  values[i] = in; \
255  } \
256  } while (0)
257 
258 #define INPUT_ARRAY3(name, len0, len1, len2) do { \
259  float *values = av_calloc(FFALIGN((len0), 4) * FFALIGN((len1), 4) * (len2), sizeof(float)); \
260  if (!values) { \
261  rnnoise_model_free(ret); \
262  return AVERROR(ENOMEM); \
263  } \
264  name = values; \
265  for (int k = 0; k < (len0); k++) { \
266  for (int i = 0; i < (len2); i++) { \
267  for (int j = 0; j < (len1); j++) { \
268  if (fscanf(f, "%d", &in) != 1) { \
269  rnnoise_model_free(ret); \
270  return AVERROR(EINVAL); \
271  } \
272  values[j * (len2) * FFALIGN((len0), 4) + i * FFALIGN((len0), 4) + k] = in; \
273  } \
274  } \
275  } \
276  } while (0)
277 
278 #define NEW_LINE() do { \
279  int c; \
280  while ((c = fgetc(f)) != EOF) { \
281  if (c == '\n') \
282  break; \
283  } \
284  } while (0)
285 
286 #define INPUT_DENSE(name) do { \
287  INPUT_VAL(name->nb_inputs); \
288  INPUT_VAL(name->nb_neurons); \
289  ret->name ## _size = name->nb_neurons; \
290  INPUT_ACTIVATION(name->activation); \
291  NEW_LINE(); \
292  INPUT_ARRAY(name->input_weights, name->nb_inputs * name->nb_neurons); \
293  NEW_LINE(); \
294  INPUT_ARRAY(name->bias, name->nb_neurons); \
295  NEW_LINE(); \
296  } while (0)
297 
298 #define INPUT_GRU(name) do { \
299  INPUT_VAL(name->nb_inputs); \
300  INPUT_VAL(name->nb_neurons); \
301  ret->name ## _size = name->nb_neurons; \
302  INPUT_ACTIVATION(name->activation); \
303  NEW_LINE(); \
304  INPUT_ARRAY3(name->input_weights, name->nb_inputs, name->nb_neurons, 3); \
305  NEW_LINE(); \
306  INPUT_ARRAY3(name->recurrent_weights, name->nb_neurons, name->nb_neurons, 3); \
307  NEW_LINE(); \
308  INPUT_ARRAY(name->bias, name->nb_neurons * 3); \
309  NEW_LINE(); \
310  } while (0)
311 
312  INPUT_DENSE(input_dense);
313  INPUT_GRU(vad_gru);
314  INPUT_GRU(noise_gru);
315  INPUT_GRU(denoise_gru);
316  INPUT_DENSE(denoise_output);
317  INPUT_DENSE(vad_output);
318 
319  if (vad_output->nb_neurons != 1) {
321  return AVERROR(EINVAL);
322  }
323 
324  *rnn = ret;
325 
326  return 0;
327 }
328 
330 {
331  static const enum AVSampleFormat sample_fmts[] = {
334  };
335  int ret, sample_rates[] = { 48000, -1 };
336 
338  if (ret < 0)
339  return ret;
340 
342  if (ret < 0)
343  return ret;
344 
346 }
347 
349 {
350  AVFilterContext *ctx = inlink->dst;
351  AudioRNNContext *s = ctx->priv;
352  int ret = 0;
353 
354  s->channels = inlink->ch_layout.nb_channels;
355 
356  if (!s->st)
357  s->st = av_calloc(s->channels, sizeof(DenoiseState));
358  if (!s->st)
359  return AVERROR(ENOMEM);
360 
361  for (int i = 0; i < s->channels; i++) {
362  DenoiseState *st = &s->st[i];
363 
364  st->rnn[0].model = s->model[0];
365  st->rnn[0].vad_gru_state = av_calloc(sizeof(float), FFALIGN(s->model[0]->vad_gru_size, 16));
366  st->rnn[0].noise_gru_state = av_calloc(sizeof(float), FFALIGN(s->model[0]->noise_gru_size, 16));
367  st->rnn[0].denoise_gru_state = av_calloc(sizeof(float), FFALIGN(s->model[0]->denoise_gru_size, 16));
368  if (!st->rnn[0].vad_gru_state ||
369  !st->rnn[0].noise_gru_state ||
370  !st->rnn[0].denoise_gru_state)
371  return AVERROR(ENOMEM);
372  }
373 
374  for (int i = 0; i < s->channels; i++) {
375  DenoiseState *st = &s->st[i];
376  float scale = 1.f;
377 
378  if (!st->tx)
379  ret = av_tx_init(&st->tx, &st->tx_fn, AV_TX_FLOAT_FFT, 0, WINDOW_SIZE, &scale, 0);
380  if (ret < 0)
381  return ret;
382 
383  if (!st->txi)
384  ret = av_tx_init(&st->txi, &st->txi_fn, AV_TX_FLOAT_FFT, 1, WINDOW_SIZE, &scale, 0);
385  if (ret < 0)
386  return ret;
387  }
388 
389  return ret;
390 }
391 
/* Second-order IIR section in transposed direct form II:
 * y[n] = x[n] + d0, with the two delay elements carried in mem[] across
 * calls so consecutive buffers filter seamlessly. */
static void biquad(float *y, float mem[2], const float *x,
                   const float *b, const float *a, int N)
{
    float d0 = mem[0];
    float d1 = mem[1];

    for (int n = 0; n < N; n++) {
        const float in  = x[n];
        const float out = in + d0;

        d0   = d1 + (b[0] * in - a[0] * out);
        d1   = b[1] * in - a[1] * out;
        y[n] = out;
    }

    mem[0] = d0;
    mem[1] = d1;
}
405 
406 #define RNN_MOVE(dst, src, n) (memmove((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src)) ))
407 #define RNN_CLEAR(dst, n) (memset((dst), 0, (n)*sizeof(*(dst))))
408 #define RNN_COPY(dst, src, n) (memcpy((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src)) ))
409 
410 static void forward_transform(DenoiseState *st, AVComplexFloat *out, const float *in)
411 {
414 
415  for (int i = 0; i < WINDOW_SIZE; i++) {
416  x[i].re = in[i];
417  x[i].im = 0;
418  }
419 
420  st->tx_fn(st->tx, y, x, sizeof(AVComplexFloat));
421 
422  RNN_COPY(out, y, FREQ_SIZE);
423 }
424 
425 static void inverse_transform(DenoiseState *st, float *out, const AVComplexFloat *in)
426 {
429 
430  RNN_COPY(x, in, FREQ_SIZE);
431 
432  for (int i = FREQ_SIZE; i < WINDOW_SIZE; i++) {
433  x[i].re = x[WINDOW_SIZE - i].re;
434  x[i].im = -x[WINDOW_SIZE - i].im;
435  }
436 
437  st->txi_fn(st->txi, y, x, sizeof(AVComplexFloat));
438 
439  for (int i = 0; i < WINDOW_SIZE; i++)
440  out[i] = y[i].re / WINDOW_SIZE;
441 }
442 
/* Band edges for the NB_BANDS triangular bands used by
 * compute_band_energy()/compute_band_corr()/interp_band_gain(); values are
 * in coarse bins and get scaled by << FRAME_SIZE_SHIFT before indexing the
 * spectrum. The frequency labels below assume 48 kHz input (the only
 * sample rate this filter accepts). */
static const uint8_t eband5ms[] = {
/*0 200 400 600 800 1k 1.2 1.4 1.6 2k 2.4 2.8 3.2 4k 4.8 5.6 6.8 8k 9.6 12k 15.6 20k*/
  0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 34, 40, 48, 60, 78, 100
};
447 
448 static void compute_band_energy(float *bandE, const AVComplexFloat *X)
449 {
450  float sum[NB_BANDS] = {0};
451 
452  for (int i = 0; i < NB_BANDS - 1; i++) {
453  int band_size;
454 
455  band_size = (eband5ms[i + 1] - eband5ms[i]) << FRAME_SIZE_SHIFT;
456  for (int j = 0; j < band_size; j++) {
457  float tmp, frac = (float)j / band_size;
458 
459  tmp = SQUARE(X[(eband5ms[i] << FRAME_SIZE_SHIFT) + j].re);
460  tmp += SQUARE(X[(eband5ms[i] << FRAME_SIZE_SHIFT) + j].im);
461  sum[i] += (1.f - frac) * tmp;
462  sum[i + 1] += frac * tmp;
463  }
464  }
465 
466  sum[0] *= 2;
467  sum[NB_BANDS - 1] *= 2;
468 
469  for (int i = 0; i < NB_BANDS; i++)
470  bandE[i] = sum[i];
471 }
472 
473 static void compute_band_corr(float *bandE, const AVComplexFloat *X, const AVComplexFloat *P)
474 {
475  float sum[NB_BANDS] = { 0 };
476 
477  for (int i = 0; i < NB_BANDS - 1; i++) {
478  int band_size;
479 
480  band_size = (eband5ms[i + 1] - eband5ms[i]) << FRAME_SIZE_SHIFT;
481  for (int j = 0; j < band_size; j++) {
482  float tmp, frac = (float)j / band_size;
483 
484  tmp = X[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].re * P[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].re;
485  tmp += X[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].im * P[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].im;
486  sum[i] += (1 - frac) * tmp;
487  sum[i + 1] += frac * tmp;
488  }
489  }
490 
491  sum[0] *= 2;
492  sum[NB_BANDS-1] *= 2;
493 
494  for (int i = 0; i < NB_BANDS; i++)
495  bandE[i] = sum[i];
496 }
497 
498 static void frame_analysis(AudioRNNContext *s, DenoiseState *st, AVComplexFloat *X, float *Ex, const float *in)
499 {
500  LOCAL_ALIGNED_32(float, x, [WINDOW_SIZE]);
501 
503  RNN_COPY(x + FRAME_SIZE, in, FRAME_SIZE);
504  RNN_COPY(st->analysis_mem, in, FRAME_SIZE);
505  s->fdsp->vector_fmul(x, x, s->window, WINDOW_SIZE);
506  forward_transform(st, X, x);
507  compute_band_energy(Ex, X);
508 }
509 
510 static void frame_synthesis(AudioRNNContext *s, DenoiseState *st, float *out, const AVComplexFloat *y)
511 {
512  LOCAL_ALIGNED_32(float, x, [WINDOW_SIZE]);
513  const float *src = st->history;
514  const float mix = s->mix;
515  const float imix = 1.f - FFMAX(mix, 0.f);
516 
517  inverse_transform(st, x, y);
518  s->fdsp->vector_fmul(x, x, s->window, WINDOW_SIZE);
519  s->fdsp->vector_fmac_scalar(x, st->synthesis_mem, 1.f, FRAME_SIZE);
520  RNN_COPY(out, x, FRAME_SIZE);
522 
523  for (int n = 0; n < FRAME_SIZE; n++)
524  out[n] = out[n] * mix + src[n] * imix;
525 }
526 
/* Core of the pitch cross-correlation: accumulate into sum[0..3] the dot
 * products of x[0..len-1] with y at four consecutive lags (y+0 .. y+3).
 * The main loop consumes four x samples per iteration and rotates the four
 * cached y values (y_0..y_3) so that each y element is loaded exactly once;
 * the three trailing blocks continue the same rotation for len % 4 leftover
 * samples. Reads at most y[0..len+2], and at least y[0..2] even for len 0.
 * Statement order is load-before-accumulate throughout — do not reorder. */
static inline void xcorr_kernel(const float *x, const float *y, float sum[4], int len)
{
    float y_0, y_1, y_2, y_3 = 0;
    int j;

    /* Prime the rotation with the first three y values. */
    y_0 = *y++;
    y_1 = *y++;
    y_2 = *y++;

    for (j = 0; j < len - 3; j += 4) {
        float tmp;

        tmp = *x++;
        y_3 = *y++;
        sum[0] += tmp * y_0;
        sum[1] += tmp * y_1;
        sum[2] += tmp * y_2;
        sum[3] += tmp * y_3;
        tmp = *x++;
        y_0 = *y++;
        sum[0] += tmp * y_1;
        sum[1] += tmp * y_2;
        sum[2] += tmp * y_3;
        sum[3] += tmp * y_0;
        tmp = *x++;
        y_1 = *y++;
        sum[0] += tmp * y_2;
        sum[1] += tmp * y_3;
        sum[2] += tmp * y_0;
        sum[3] += tmp * y_1;
        tmp = *x++;
        y_2 = *y++;
        sum[0] += tmp * y_3;
        sum[1] += tmp * y_0;
        sum[2] += tmp * y_1;
        sum[3] += tmp * y_2;
    }

    /* Tail: up to three remaining samples, continuing the rotation. */
    if (j++ < len) {
        float tmp = *x++;

        y_3 = *y++;
        sum[0] += tmp * y_0;
        sum[1] += tmp * y_1;
        sum[2] += tmp * y_2;
        sum[3] += tmp * y_3;
    }

    if (j++ < len) {
        float tmp=*x++;

        y_0 = *y++;
        sum[0] += tmp * y_1;
        sum[1] += tmp * y_2;
        sum[2] += tmp * y_3;
        sum[3] += tmp * y_0;
    }

    if (j < len) {
        float tmp=*x++;

        y_1 = *y++;
        sum[0] += tmp * y_2;
        sum[1] += tmp * y_3;
        sum[2] += tmp * y_0;
        sum[3] += tmp * y_1;
    }
}
595 
/* Plain dot product <x, y> over N samples, accumulated left to right. */
static inline float celt_inner_prod(const float *x,
                                    const float *y, int N)
{
    float acc = 0.f;
    int n = 0;

    while (n < N) {
        acc += x[n] * y[n];
        n++;
    }

    return acc;
}
606 
/* Cross-correlate x against y at lags 0..max_pitch-1: four lags at a time
 * through xcorr_kernel(), then a scalar pass for any remainder. */
static void celt_pitch_xcorr(const float *x, const float *y,
                             float *xcorr, int len, int max_pitch)
{
    int lag = 0;

    for (; lag < max_pitch - 3; lag += 4) {
        float acc[4] = { 0, 0, 0, 0 };

        xcorr_kernel(x, y + lag, acc, len);

        xcorr[lag]     = acc[0];
        xcorr[lag + 1] = acc[1];
        xcorr[lag + 2] = acc[2];
        xcorr[lag + 3] = acc[3];
    }
    /* In case max_pitch isn't a multiple of 4, do non-unrolled version. */
    for (; lag < max_pitch; lag++)
        xcorr[lag] = celt_inner_prod(x, y + lag, len);
}
627 
/* Autocorrelation ac[0..lag] of x[0..n-1]. If overlap != 0, a symmetric
 * taper of `window` is first applied to both ends of a local copy of x.
 * The first fastN = n - lag products of every lag come from
 * celt_pitch_xcorr(); the remaining triangular tail is added directly.
 * The return value is a fixed-point scaling shift, always 0 in this
 * float build (kept for API parity with the fixed-point original).
 * NOTE(review): the local copy xx holds PITCH_BUF_SIZE/2 floats, so
 * callers using overlap != 0 need n <= PITCH_BUF_SIZE/2 — confirm at
 * call sites (the only caller here passes overlap == 0). */
static int celt_autocorr(const float *x, /* in: [0...n-1] samples x */
                         float *ac, /* out: [0...lag-1] ac values */
                         const float *window,
                         int overlap,
                         int lag,
                         int n)
{
    int fastN = n - lag;
    int shift;
    const float *xptr;
    float xx[PITCH_BUF_SIZE>>1];

    if (overlap == 0) {
        xptr = x;
    } else {
        for (int i = 0; i < n; i++)
            xx[i] = x[i];
        for (int i = 0; i < overlap; i++) {
            xx[i] = x[i] * window[i];
            xx[n-i-1] = x[n-i-1] * window[i];
        }
        xptr = xx;
    }

    shift = 0;
    celt_pitch_xcorr(xptr, xptr, ac, fastN, lag+1);

    /* Add the tail products that the fastN-long xcorr pass left out. */
    for (int k = 0; k <= lag; k++) {
        float d = 0.f;

        for (int i = k + fastN; i < n; i++)
            d += xptr[i] * xptr[i-k];
        ac[k] += d;
    }

    return shift;
}
665 
/* Levinson-Durbin recursion: derive p LPC coefficients from the
 * autocorrelation values ac[0..p]. lpc[] is zeroed up front, so a
 * degenerate ac[0] == 0 (all-zero input) yields all-zero coefficients. */
static void celt_lpc(float *lpc, /* out: [0...p-1] LPC coefficients */
                     const float *ac, /* in: [0...p] autocorrelation values */
                     int p)
{
    float r, error = ac[0];

    RNN_CLEAR(lpc, p);
    if (ac[0] != 0) {
        for (int i = 0; i < p; i++) {
            /* Sum up this iteration's reflection coefficient */
            float rr = 0;
            for (int j = 0; j < i; j++)
                rr += (lpc[j] * ac[i - j]);
            rr += ac[i + 1];
            r = -rr/error;
            /* Update LPC coefficients and total error */
            lpc[i] = r;
            /* Symmetric in-place update of coefficient pairs (j, i-1-j);
             * both old values are read before either is written. */
            for (int j = 0; j < (i + 1) >> 1; j++) {
                float tmp1, tmp2;
                tmp1 = lpc[j];
                tmp2 = lpc[i-1-j];
                lpc[j] = tmp1 + (r*tmp2);
                lpc[i-1-j] = tmp2 + (r*tmp1);
            }

            error = error - (r * r *error);
            /* Bail out once we get 30 dB gain */
            if (error < .001f * ac[0])
                break;
        }
    }
}
698 
/* 5-tap FIR: y[i] = x[i] + sum_k num[k] * x[i-1-k], with the input history
 * carried in mem[5] across calls. Supports in-place use (y may alias x):
 * x[i] is read before y[i] is written. Accumulation order matches the
 * original tap order num[0]..num[4]. */
static void celt_fir5(const float *x,
                      const float *num,
                      float *y,
                      int N,
                      float *mem)
{
    float coef[5], hist[5];

    for (int k = 0; k < 5; k++) {
        coef[k] = num[k];
        hist[k] = mem[k];
    }

    for (int i = 0; i < N; i++) {
        const float xi = x[i];
        float acc = xi;

        acc += coef[0] * hist[0];
        acc += coef[1] * hist[1];
        acc += coef[2] * hist[2];
        acc += coef[3] * hist[3];
        acc += coef[4] * hist[4];

        hist[4] = hist[3];
        hist[3] = hist[2];
        hist[2] = hist[1];
        hist[1] = hist[0];
        hist[0] = xi;

        y[i] = acc;
    }

    for (int k = 0; k < 5; k++)
        mem[k] = hist[k];
}
741 
/* Downsample x (C = 1 or 2 channels, summed) by 2 into x_lp with a small
 * [.25 .5 .25] low-pass kernel, then whiten the result with a 4th-order
 * LPC-derived filter so the subsequent pitch search runs on a spectrally
 * flattened signal. */
static void pitch_downsample(float *x[], float *x_lp,
                             int len, int C)
{
    float ac[5];
    float tmp=Q15ONE;
    float lpc[4], mem[5]={0,0,0,0,0};
    float lpc2[5];
    float c1 = .8f;

    /* 2:1 decimation with triangular smoothing; sample 0 only has a
     * right-hand neighbour. */
    for (int i = 1; i < len >> 1; i++)
        x_lp[i] = .5f * (.5f * (x[0][(2*i-1)]+x[0][(2*i+1)])+x[0][2*i]);
    x_lp[0] = .5f * (.5f * (x[0][1])+x[0][0]);
    if (C==2) {
        for (int i = 1; i < len >> 1; i++)
            x_lp[i] += (.5f * (.5f * (x[1][(2*i-1)]+x[1][(2*i+1)])+x[1][2*i]));
        x_lp[0] += .5f * (.5f * (x[1][1])+x[1][0]);
    }

    celt_autocorr(x_lp, ac, NULL, 0, 4, len>>1);

    /* Noise floor -40 dB */
    ac[0] *= 1.0001f;
    /* Lag windowing */
    for (int i = 1; i <= 4; i++) {
        /*ac[i] *= exp(-.5*(2*M_PI*.002*i)*(2*M_PI*.002*i));*/
        ac[i] -= ac[i]*(.008f*i)*(.008f*i);
    }

    celt_lpc(lpc, ac, 4);
    /* Bandwidth expansion: progressively damp the LPC coefficients. */
    for (int i = 0; i < 4; i++) {
        tmp = .9f * tmp;
        lpc[i] = (lpc[i] * tmp);
    }
    /* Add a zero */
    lpc2[0] = lpc[0] + .8f;
    lpc2[1] = lpc[1] + (c1 * lpc[0]);
    lpc2[2] = lpc[2] + (c1 * lpc[1]);
    lpc2[3] = lpc[3] + (c1 * lpc[2]);
    lpc2[4] = (c1 * lpc[3]);
    /* celt_fir5() reads x[i] before writing y[i], so filtering x_lp in
     * place is safe. */
    celt_fir5(x_lp, lpc2, x_lp, len>>1, mem);
}
783 
/* Two dot products against a shared x in a single pass:
 * *xy1 = <x, y01> and *xy2 = <x, y02>. */
static inline void dual_inner_prod(const float *x, const float *y01, const float *y02,
                                   int N, float *xy1, float *xy2)
{
    float acc1 = 0, acc2 = 0;

    for (int n = 0; n < N; n++) {
        acc1 += (x[n] * y01[n]);
        acc2 += (x[n] * y02[n]);
    }

    *xy1 = acc1;
    *xy2 = acc2;
}
797 
798 static float compute_pitch_gain(float xy, float xx, float yy)
799 {
800  return xy / sqrtf(1.f + xx * yy);
801 }
802 
/* For pitch candidate period T0/k, second_check[k] selects which multiple
 * of the sub-period to probe for a corroborating correlation peak. */
static const uint8_t second_check[16] = {0, 0, 3, 2, 3, 2, 5, 2, 3, 2, 3, 2, 5, 2, 3, 2};
/* Detect and undo period doubling: if a sub-multiple T0/k of the detected
 * period correlates strongly enough (relative to a threshold biased by the
 * previous frame's period/gain), prefer it. Works on the signal decimated
 * by 2 (hence the /2 of every period and length up front) and maps the
 * final period back to full resolution with a +/-1 pseudo-interpolation
 * offset. Returns the pitch gain of the selected period. */
static float remove_doubling(float *x, int maxperiod, int minperiod, int N,
                             int *T0_, int prev_period, float prev_gain)
{
    int k, i, T, T0;
    float g, g0;
    float pg;
    float xy,xx,yy,xy2;
    float xcorr[3];
    float best_xy, best_yy;
    int offset;
    int minperiod0;
    float yy_lookup[PITCH_MAX_PERIOD+1];

    minperiod0 = minperiod;
    maxperiod /= 2;
    minperiod /= 2;
    *T0_ /= 2;
    prev_period /= 2;
    N /= 2;
    x += maxperiod;
    if (*T0_>=maxperiod)
        *T0_=maxperiod-1;

    T = T0 = *T0_;
    dual_inner_prod(x, x, x-T0, N, &xx, &xy);
    /* yy_lookup[i] caches the energy of the window starting i samples
     * back, maintained incrementally (clamped to >= 0 against rounding). */
    yy_lookup[0] = xx;
    yy=xx;
    for (i = 1; i <= maxperiod; i++) {
        yy = yy+(x[-i] * x[-i])-(x[N-i] * x[N-i]);
        yy_lookup[i] = FFMAX(0, yy);
    }
    yy = yy_lookup[T0];
    best_xy = xy;
    best_yy = yy;
    g = g0 = compute_pitch_gain(xy, xx, yy);
    /* Look for any pitch at T/k */
    for (k = 2; k <= 15; k++) {
        int T1, T1b;
        float g1;
        float cont=0;
        float thresh;
        T1 = (2*T0+k)/(2*k);  /* rounded T0/k */
        if (T1 < minperiod)
            break;
        /* Look for another strong correlation at T1b */
        if (k==2)
        {
            if (T1+T0>maxperiod)
                T1b = T0;
            else
                T1b = T0+T1;
        } else
        {
            T1b = (2*second_check[k]*T0+k)/(2*k);
        }
        dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2);
        xy = .5f * (xy + xy2);
        yy = .5f * (yy_lookup[T1] + yy_lookup[T1b]);
        g1 = compute_pitch_gain(xy, xx, yy);
        /* Continuity bonus: a candidate close to last frame's period gets
         * credited with (part of) the previous gain. */
        if (FFABS(T1-prev_period)<=1)
            cont = prev_gain;
        else if (FFABS(T1-prev_period)<=2 && 5 * k * k < T0)
            cont = prev_gain * .5f;
        else
            cont = 0;
        thresh = FFMAX(.3f, (.7f * g0) - cont);
        /* Bias against very high pitch (very short period) to avoid false-positives
           due to short-term correlation */
        if (T1<3*minperiod)
            thresh = FFMAX(.4f, (.85f * g0) - cont);
        else if (T1<2*minperiod)
            thresh = FFMAX(.5f, (.9f * g0) - cont);
        if (g1 > thresh)
        {
            best_xy = xy;
            best_yy = yy;
            T = T1;
            g = g1;
        }
    }
    best_xy = FFMAX(0, best_xy);
    if (best_yy <= best_xy)
        pg = Q15ONE;
    else
        pg = best_xy/(best_yy + 1);

    /* Pseudo-interpolation: compare correlations at T-1, T, T+1 to pick a
     * +/-1 offset before scaling the period back up by 2. */
    for (k = 0; k < 3; k++)
        xcorr[k] = celt_inner_prod(x, x-(T+k-1), N);
    if ((xcorr[2]-xcorr[0]) > .7f * (xcorr[1]-xcorr[0]))
        offset = 1;
    else if ((xcorr[0]-xcorr[2]) > (.7f * (xcorr[1] - xcorr[2])))
        offset = -1;
    else
        offset = 0;
    if (pg > g)
        pg = g;
    *T0_ = 2*T+offset;

    if (*T0_<minperiod0)
        *T0_=minperiod0;
    return pg;
}
906 
/* Select the two best pitch lags from xcorr[0..max_pitch-1], scoring each
 * lag by its (scaled) squared correlation normalized by the window energy
 * Syy, which is slid forward incrementally per lag. best_pitch[0] is the
 * strongest candidate, best_pitch[1] the runner-up.
 * NOTE(review): the incremental update reads y[i+len], so y must contain
 * len + max_pitch samples — confirm at call sites (pitch_search() passes
 * the decimated history buffer). */
static void find_best_pitch(float *xcorr, float *y, int len,
                            int max_pitch, int *best_pitch)
{
    float best_num[2];
    float best_den[2];
    float Syy = 1.f;  /* starts at 1 so the energy stays strictly positive */

    best_num[0] = -1;
    best_num[1] = -1;
    best_den[0] = 0;
    best_den[1] = 0;
    best_pitch[0] = 0;
    best_pitch[1] = 1;

    for (int j = 0; j < len; j++)
        Syy += y[j] * y[j];

    for (int i = 0; i < max_pitch; i++) {
        if (xcorr[i]>0) {
            float num;
            float xcorr16;

            xcorr16 = xcorr[i];
            /* Considering the range of xcorr16, this should avoid both underflows
               and overflows (inf) when squaring xcorr16 */
            xcorr16 *= 1e-12f;
            num = xcorr16 * xcorr16;
            /* Cross-multiplied comparisons of num/Syy against the stored
             * best_num/best_den ratios (avoids divisions). */
            if ((num * best_den[1]) > (best_num[1] * Syy)) {
                if ((num * best_den[0]) > (best_num[0] * Syy)) {
                    best_num[1] = best_num[0];
                    best_den[1] = best_den[0];
                    best_pitch[1] = best_pitch[0];
                    best_num[0] = num;
                    best_den[0] = Syy;
                    best_pitch[0] = i;
                } else {
                    best_num[1] = num;
                    best_den[1] = Syy;
                    best_pitch[1] = i;
                }
            }
        }
        /* Slide the energy window one sample forward for the next lag. */
        Syy += y[i+len]*y[i+len] - y[i] * y[i];
        Syy = FFMAX(1, Syy);
    }
}
953 
/* Two-stage pitch search of x_lp (already 2x decimated) against the lagged
 * history y: a coarse pass at an extra 2x decimation, then a refinement at
 * the original resolution restricted to lags near the two coarse
 * candidates, finished with pseudo-interpolation of the correlation peak.
 * The result *pitch is a lag in x_lp's sample scale. */
static void pitch_search(const float *x_lp, float *y,
                         int len, int max_pitch, int *pitch)
{
    int lag;
    int best_pitch[2]={0,0};
    int offset;

    float x_lp4[WINDOW_SIZE];
    float y_lp4[WINDOW_SIZE];
    float xcorr[WINDOW_SIZE];

    lag = len+max_pitch;

    /* Downsample by 2 again */
    for (int j = 0; j < len >> 2; j++)
        x_lp4[j] = x_lp[2*j];
    for (int j = 0; j < lag >> 2; j++)
        y_lp4[j] = y[2*j];

    /* Coarse search with 4x decimation */

    celt_pitch_xcorr(x_lp4, y_lp4, xcorr, len>>2, max_pitch>>2);

    find_best_pitch(xcorr, y_lp4, len>>2, max_pitch>>2, best_pitch);

    /* Finer search with 2x decimation */
    for (int i = 0; i < max_pitch >> 1; i++) {
        float sum;
        xcorr[i] = 0;
        /* Only evaluate lags within +/-2 of a (scaled) coarse candidate. */
        if (FFABS(i-2*best_pitch[0])>2 && FFABS(i-2*best_pitch[1])>2)
            continue;
        sum = celt_inner_prod(x_lp, y+i, len>>1);
        xcorr[i] = FFMAX(-1, sum);
    }

    find_best_pitch(xcorr, y, len>>1, max_pitch>>1, best_pitch);

    /* Refine by pseudo-interpolation */
    if (best_pitch[0] > 0 && best_pitch[0] < (max_pitch >> 1) - 1) {
        float a, b, c;

        a = xcorr[best_pitch[0] - 1];
        b = xcorr[best_pitch[0]];
        c = xcorr[best_pitch[0] + 1];
        if (c - a > .7f * (b - a))
            offset = 1;
        else if (a - c > .7f * (b-c))
            offset = -1;
        else
            offset = 0;
    } else {
        offset = 0;
    }

    *pitch = 2 * best_pitch[0] - offset;
}
1010 
1011 static void dct(AudioRNNContext *s, float *out, const float *in)
1012 {
1013  for (int i = 0; i < NB_BANDS; i++) {
1014  float sum;
1015 
1016  sum = s->fdsp->scalarproduct_float(in, s->dct_table[i], FFALIGN(NB_BANDS, 4));
1017  out[i] = sum * sqrtf(2.f / 22);
1018  }
1019 }
1020 
1022  float *Ex, float *Ep, float *Exp, float *features, const float *in)
1023 {
1024  float E = 0;
1025  float *ceps_0, *ceps_1, *ceps_2;
1026  float spec_variability = 0;
1027  LOCAL_ALIGNED_32(float, Ly, [NB_BANDS]);
1028  LOCAL_ALIGNED_32(float, p, [WINDOW_SIZE]);
1029  float pitch_buf[PITCH_BUF_SIZE>>1];
1030  int pitch_index;
1031  float gain;
1032  float *(pre[1]);
1033  float tmp[NB_BANDS];
1034  float follow, logMax;
1035 
1036  frame_analysis(s, st, X, Ex, in);
1039  pre[0] = &st->pitch_buf[0];
1040  pitch_downsample(pre, pitch_buf, PITCH_BUF_SIZE, 1);
1041  pitch_search(pitch_buf+(PITCH_MAX_PERIOD>>1), pitch_buf, PITCH_FRAME_SIZE,
1042  PITCH_MAX_PERIOD-3*PITCH_MIN_PERIOD, &pitch_index);
1043  pitch_index = PITCH_MAX_PERIOD-pitch_index;
1044 
1046  PITCH_FRAME_SIZE, &pitch_index, st->last_period, st->last_gain);
1047  st->last_period = pitch_index;
1048  st->last_gain = gain;
1049 
1050  for (int i = 0; i < WINDOW_SIZE; i++)
1051  p[i] = st->pitch_buf[PITCH_BUF_SIZE-WINDOW_SIZE-pitch_index+i];
1052 
1053  s->fdsp->vector_fmul(p, p, s->window, WINDOW_SIZE);
1054  forward_transform(st, P, p);
1055  compute_band_energy(Ep, P);
1056  compute_band_corr(Exp, X, P);
1057 
1058  for (int i = 0; i < NB_BANDS; i++)
1059  Exp[i] = Exp[i] / sqrtf(.001f+Ex[i]*Ep[i]);
1060 
1061  dct(s, tmp, Exp);
1062 
1063  for (int i = 0; i < NB_DELTA_CEPS; i++)
1064  features[NB_BANDS+2*NB_DELTA_CEPS+i] = tmp[i];
1065 
1066  features[NB_BANDS+2*NB_DELTA_CEPS] -= 1.3;
1067  features[NB_BANDS+2*NB_DELTA_CEPS+1] -= 0.9;
1068  features[NB_BANDS+3*NB_DELTA_CEPS] = .01*(pitch_index-300);
1069  logMax = -2;
1070  follow = -2;
1071 
1072  for (int i = 0; i < NB_BANDS; i++) {
1073  Ly[i] = log10f(1e-2f + Ex[i]);
1074  Ly[i] = FFMAX(logMax-7, FFMAX(follow-1.5, Ly[i]));
1075  logMax = FFMAX(logMax, Ly[i]);
1076  follow = FFMAX(follow-1.5, Ly[i]);
1077  E += Ex[i];
1078  }
1079 
1080  if (E < 0.04f) {
1081  /* If there's no audio, avoid messing up the state. */
1082  RNN_CLEAR(features, NB_FEATURES);
1083  return 1;
1084  }
1085 
1086  dct(s, features, Ly);
1087  features[0] -= 12;
1088  features[1] -= 4;
1089  ceps_0 = st->cepstral_mem[st->memid];
1090  ceps_1 = (st->memid < 1) ? st->cepstral_mem[CEPS_MEM+st->memid-1] : st->cepstral_mem[st->memid-1];
1091  ceps_2 = (st->memid < 2) ? st->cepstral_mem[CEPS_MEM+st->memid-2] : st->cepstral_mem[st->memid-2];
1092 
1093  for (int i = 0; i < NB_BANDS; i++)
1094  ceps_0[i] = features[i];
1095 
1096  st->memid++;
1097  for (int i = 0; i < NB_DELTA_CEPS; i++) {
1098  features[i] = ceps_0[i] + ceps_1[i] + ceps_2[i];
1099  features[NB_BANDS+i] = ceps_0[i] - ceps_2[i];
1100  features[NB_BANDS+NB_DELTA_CEPS+i] = ceps_0[i] - 2*ceps_1[i] + ceps_2[i];
1101  }
1102  /* Spectral variability features. */
1103  if (st->memid == CEPS_MEM)
1104  st->memid = 0;
1105 
1106  for (int i = 0; i < CEPS_MEM; i++) {
1107  float mindist = 1e15f;
1108  for (int j = 0; j < CEPS_MEM; j++) {
1109  float dist = 0.f;
1110  for (int k = 0; k < NB_BANDS; k++) {
1111  float tmp;
1112 
1113  tmp = st->cepstral_mem[i][k] - st->cepstral_mem[j][k];
1114  dist += tmp*tmp;
1115  }
1116 
1117  if (j != i)
1118  mindist = FFMIN(mindist, dist);
1119  }
1120 
1121  spec_variability += mindist;
1122  }
1123 
1124  features[NB_BANDS+3*NB_DELTA_CEPS+1] = spec_variability/CEPS_MEM-2.1;
1125 
1126  return 0;
1127 }
1128 
1129 static void interp_band_gain(float *g, const float *bandE)
1130 {
1131  memset(g, 0, sizeof(*g) * FREQ_SIZE);
1132 
1133  for (int i = 0; i < NB_BANDS - 1; i++) {
1134  const int band_size = (eband5ms[i + 1] - eband5ms[i]) << FRAME_SIZE_SHIFT;
1135 
1136  for (int j = 0; j < band_size; j++) {
1137  float frac = (float)j / band_size;
1138 
1139  g[(eband5ms[i] << FRAME_SIZE_SHIFT) + j] = (1.f - frac) * bandE[i] + frac * bandE[i + 1];
1140  }
1141  }
1142 }
1143 
/* Pitch-based comb filtering: mix the pitch-delayed spectrum P into X with
 * a per-band strength r[] derived from the band correlation Exp[] and the
 * predicted gain g[], then rescale each band of X back to its original
 * energy Ex[] so the operation is energy-neutral. */
static void pitch_filter(AVComplexFloat *X, const AVComplexFloat *P, const float *Ex, const float *Ep,
                         const float *Exp, const float *g)
{
    float newE[NB_BANDS];
    float r[NB_BANDS];
    float norm[NB_BANDS];
    float rf[FREQ_SIZE] = {0};
    float normf[FREQ_SIZE]={0};

    for (int i = 0; i < NB_BANDS; i++) {
        /* Comb strength: full when the correlation already exceeds the
         * gain, otherwise a ratio clipped to [0,1]; the small constants
         * guard the divisions. Scaled by the X/P energy ratio. */
        if (Exp[i]>g[i]) r[i] = 1;
        else r[i] = SQUARE(Exp[i])*(1-SQUARE(g[i]))/(.001 + SQUARE(g[i])*(1-SQUARE(Exp[i])));
        r[i] = sqrtf(av_clipf(r[i], 0, 1));
        r[i] *= sqrtf(Ex[i]/(1e-8+Ep[i]));
    }
    interp_band_gain(rf, r);
    for (int i = 0; i < FREQ_SIZE; i++) {
        X[i].re += rf[i]*P[i].re;
        X[i].im += rf[i]*P[i].im;
    }
    /* Renormalize so each band keeps its pre-filter energy. */
    compute_band_energy(newE, X);
    for (int i = 0; i < NB_BANDS; i++) {
        norm[i] = sqrtf(Ex[i] / (1e-8+newE[i]));
    }
    interp_band_gain(normf, norm);
    for (int i = 0; i < FREQ_SIZE; i++) {
        X[i].re *= normf[i];
        X[i].im *= normf[i];
    }
}
1174 
/* Lookup table for tanh(x) sampled on [0, 8] in steps of 0.04
 * (tansig_table[i] ~= tanh(i / 25.0)); consumed by tansig_approx(),
 * which interpolates between adjacent entries. */
static const float tansig_table[201] = {
    0.000000f, 0.039979f, 0.079830f, 0.119427f, 0.158649f,
    0.197375f, 0.235496f, 0.272905f, 0.309507f, 0.345214f,
    0.379949f, 0.413644f, 0.446244f, 0.477700f, 0.507977f,
    0.537050f, 0.564900f, 0.591519f, 0.616909f, 0.641077f,
    0.664037f, 0.685809f, 0.706419f, 0.725897f, 0.744277f,
    0.761594f, 0.777888f, 0.793199f, 0.807569f, 0.821040f,
    0.833655f, 0.845456f, 0.856485f, 0.866784f, 0.876393f,
    0.885352f, 0.893698f, 0.901468f, 0.908698f, 0.915420f,
    0.921669f, 0.927473f, 0.932862f, 0.937863f, 0.942503f,
    0.946806f, 0.950795f, 0.954492f, 0.957917f, 0.961090f,
    0.964028f, 0.966747f, 0.969265f, 0.971594f, 0.973749f,
    0.975743f, 0.977587f, 0.979293f, 0.980869f, 0.982327f,
    0.983675f, 0.984921f, 0.986072f, 0.987136f, 0.988119f,
    0.989027f, 0.989867f, 0.990642f, 0.991359f, 0.992020f,
    0.992631f, 0.993196f, 0.993718f, 0.994199f, 0.994644f,
    0.995055f, 0.995434f, 0.995784f, 0.996108f, 0.996407f,
    0.996682f, 0.996937f, 0.997172f, 0.997389f, 0.997590f,
    0.997775f, 0.997946f, 0.998104f, 0.998249f, 0.998384f,
    0.998508f, 0.998623f, 0.998728f, 0.998826f, 0.998916f,
    0.999000f, 0.999076f, 0.999147f, 0.999213f, 0.999273f,
    0.999329f, 0.999381f, 0.999428f, 0.999472f, 0.999513f,
    0.999550f, 0.999585f, 0.999617f, 0.999646f, 0.999673f,
    0.999699f, 0.999722f, 0.999743f, 0.999763f, 0.999781f,
    0.999798f, 0.999813f, 0.999828f, 0.999841f, 0.999853f,
    0.999865f, 0.999875f, 0.999885f, 0.999893f, 0.999902f,
    0.999909f, 0.999916f, 0.999923f, 0.999929f, 0.999934f,
    0.999939f, 0.999944f, 0.999948f, 0.999952f, 0.999956f,
    0.999959f, 0.999962f, 0.999965f, 0.999968f, 0.999970f,
    0.999973f, 0.999975f, 0.999977f, 0.999978f, 0.999980f,
    0.999982f, 0.999983f, 0.999984f, 0.999986f, 0.999987f,
    0.999988f, 0.999989f, 0.999990f, 0.999990f, 0.999991f,
    0.999992f, 0.999992f, 0.999993f, 0.999994f, 0.999994f,
    0.999994f, 0.999995f, 0.999995f, 0.999996f, 0.999996f,
    0.999996f, 0.999997f, 0.999997f, 0.999997f, 0.999997f,
    0.999997f, 0.999998f, 0.999998f, 0.999998f, 0.999998f,
    0.999998f, 0.999998f, 0.999999f, 0.999999f, 0.999999f,
    0.999999f, 0.999999f, 0.999999f, 0.999999f, 0.999999f,
    0.999999f, 0.999999f, 0.999999f, 0.999999f, 0.999999f,
    1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
    1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
    1.000000f,
};
1218 
1219 static inline float tansig_approx(float x)
1220 {
1221  float y, dy;
1222  float sign=1;
1223  int i;
1224 
1225  /* Tests are reversed to catch NaNs */
1226  if (!(x<8))
1227  return 1;
1228  if (!(x>-8))
1229  return -1;
1230  /* Another check in case of -ffast-math */
1231 
1232  if (isnan(x))
1233  return 0;
1234 
1235  if (x < 0) {
1236  x=-x;
1237  sign=-1;
1238  }
1239  i = (int)floor(.5f+25*x);
1240  x -= .04f*i;
1241  y = tansig_table[i];
1242  dy = 1-y*y;
1243  y = y + x*dy*(1 - y*x);
1244  return sign*y;
1245 }
1246 
/* Logistic sigmoid via the tanh identity: sigma(x) = 0.5 + 0.5 * tanh(x / 2). */
static inline float sigmoid_approx(float x)
{
    const float t = tansig_approx(.5f * x);

    return .5f + .5f * t;
}
1251 
1252 static void compute_dense(const DenseLayer *layer, float *output, const float *input)
1253 {
1254  const int N = layer->nb_neurons, M = layer->nb_inputs, stride = N;
1255 
1256  for (int i = 0; i < N; i++) {
1257  /* Compute update gate. */
1258  float sum = layer->bias[i];
1259 
1260  for (int j = 0; j < M; j++)
1261  sum += layer->input_weights[j * stride + i] * input[j];
1262 
1263  output[i] = WEIGHTS_SCALE * sum;
1264  }
1265 
1266  if (layer->activation == ACTIVATION_SIGMOID) {
1267  for (int i = 0; i < N; i++)
1269  } else if (layer->activation == ACTIVATION_TANH) {
1270  for (int i = 0; i < N; i++)
1271  output[i] = tansig_approx(output[i]);
1272  } else if (layer->activation == ACTIVATION_RELU) {
1273  for (int i = 0; i < N; i++)
1274  output[i] = FFMAX(0, output[i]);
1275  } else {
1276  av_assert0(0);
1277  }
1278 }
1279 
/* Run one step of a GRU layer: reads the previous activation from 'state'
 * (N floats) and overwrites it with the new one, driven by 'input' (M floats).
 * Weight rows are padded to multiples of 4 floats (AN/AM) so the SIMD dot
 * products can operate on fixed-aligned lengths; the three gates' weight
 * rows are interleaved, hence stride = 3 * AN (resp. istride = 3 * AM). */
static void compute_gru(AudioRNNContext *s, const GRULayer *gru, float *state, const float *input)
{
    LOCAL_ALIGNED_32(float, z, [MAX_NEURONS]);  /* update gate */
    LOCAL_ALIGNED_32(float, r, [MAX_NEURONS]);  /* reset gate  */
    LOCAL_ALIGNED_32(float, h, [MAX_NEURONS]);  /* new state   */
    const int M = gru->nb_inputs;
    const int N = gru->nb_neurons;
    const int AN = FFALIGN(N, 4);
    const int AM = FFALIGN(M, 4);
    const int stride = 3 * AN, istride = 3 * AM;

    for (int i = 0; i < N; i++) {
        /* Compute update gate. */
        float sum = gru->bias[i];

        sum += s->fdsp->scalarproduct_float(gru->input_weights + i * istride, input, AM);
        sum += s->fdsp->scalarproduct_float(gru->recurrent_weights + i * stride, state, AN);
        z[i] = sigmoid_approx(WEIGHTS_SCALE * sum);
    }

    for (int i = 0; i < N; i++) {
        /* Compute reset gate. */
        float sum = gru->bias[N + i];

        sum += s->fdsp->scalarproduct_float(gru->input_weights + AM + i * istride, input, AM);
        sum += s->fdsp->scalarproduct_float(gru->recurrent_weights + AN + i * stride, state, AN);
        r[i] = sigmoid_approx(WEIGHTS_SCALE * sum);
    }

    for (int i = 0; i < N; i++) {
        /* Compute output: recurrent contribution is gated element-wise by r[],
         * so this sum cannot use the padded SIMD dot product. */
        float sum = gru->bias[2 * N + i];

        sum += s->fdsp->scalarproduct_float(gru->input_weights + 2 * AM + i * istride, input, AM);
        for (int j = 0; j < N; j++)
            sum += gru->recurrent_weights[2 * AN + i * stride + j] * state[j] * r[j];

        if (gru->activation == ACTIVATION_SIGMOID)
            sum = sigmoid_approx(WEIGHTS_SCALE * sum);
        else if (gru->activation == ACTIVATION_TANH)
            sum = tansig_approx(WEIGHTS_SCALE * sum);
        else if (gru->activation == ACTIVATION_RELU)
            sum = FFMAX(0, WEIGHTS_SCALE * sum);
        else
            av_assert0(0);
        /* Blend old state and candidate via the update gate. */
        h[i] = z[i] * state[i] + (1.f - z[i]) * sum;
    }

    RNN_COPY(state, h, N);
}
1330 
1331 #define INPUT_SIZE 42
1332 
1333 static void compute_rnn(AudioRNNContext *s, RNNState *rnn, float *gains, float *vad, const float *input)
1334 {
1335  LOCAL_ALIGNED_32(float, dense_out, [MAX_NEURONS]);
1336  LOCAL_ALIGNED_32(float, noise_input, [MAX_NEURONS * 3]);
1337  LOCAL_ALIGNED_32(float, denoise_input, [MAX_NEURONS * 3]);
1338 
1339  compute_dense(rnn->model->input_dense, dense_out, input);
1340  compute_gru(s, rnn->model->vad_gru, rnn->vad_gru_state, dense_out);
1341  compute_dense(rnn->model->vad_output, vad, rnn->vad_gru_state);
1342 
1343  memcpy(noise_input, dense_out, rnn->model->input_dense_size * sizeof(float));
1344  memcpy(noise_input + rnn->model->input_dense_size,
1345  rnn->vad_gru_state, rnn->model->vad_gru_size * sizeof(float));
1346  memcpy(noise_input + rnn->model->input_dense_size + rnn->model->vad_gru_size,
1347  input, INPUT_SIZE * sizeof(float));
1348 
1349  compute_gru(s, rnn->model->noise_gru, rnn->noise_gru_state, noise_input);
1350 
1351  memcpy(denoise_input, rnn->vad_gru_state, rnn->model->vad_gru_size * sizeof(float));
1352  memcpy(denoise_input + rnn->model->vad_gru_size,
1353  rnn->noise_gru_state, rnn->model->noise_gru_size * sizeof(float));
1354  memcpy(denoise_input + rnn->model->vad_gru_size + rnn->model->noise_gru_size,
1355  input, INPUT_SIZE * sizeof(float));
1356 
1357  compute_gru(s, rnn->model->denoise_gru, rnn->denoise_gru_state, denoise_input);
1359 }
1360 
1361 static float rnnoise_channel(AudioRNNContext *s, DenoiseState *st, float *out, const float *in,
1362  int disabled)
1363 {
1366  float x[FRAME_SIZE];
1367  float Ex[NB_BANDS], Ep[NB_BANDS];
1368  LOCAL_ALIGNED_32(float, Exp, [NB_BANDS]);
1369  float features[NB_FEATURES];
1370  float g[NB_BANDS];
1371  float gf[FREQ_SIZE];
1372  float vad_prob = 0;
1373  float *history = st->history;
1374  static const float a_hp[2] = {-1.99599, 0.99600};
1375  static const float b_hp[2] = {-2, 1};
1376  int silence;
1377 
1378  biquad(x, st->mem_hp_x, in, b_hp, a_hp, FRAME_SIZE);
1379  silence = compute_frame_features(s, st, X, P, Ex, Ep, Exp, features, x);
1380 
1381  if (!silence && !disabled) {
1382  compute_rnn(s, &st->rnn[0], g, &vad_prob, features);
1383  pitch_filter(X, P, Ex, Ep, Exp, g);
1384  for (int i = 0; i < NB_BANDS; i++) {
1385  float alpha = .6f;
1386 
1387  g[i] = FFMAX(g[i], alpha * st->lastg[i]);
1388  st->lastg[i] = g[i];
1389  }
1390 
1391  interp_band_gain(gf, g);
1392 
1393  for (int i = 0; i < FREQ_SIZE; i++) {
1394  X[i].re *= gf[i];
1395  X[i].im *= gf[i];
1396  }
1397  }
1398 
1399  frame_synthesis(s, st, out, X);
1400  memcpy(history, in, FRAME_SIZE * sizeof(*history));
1401 
1402  return vad_prob;
1403 }
1404 
/* Frame pair handed to the slice-threaded worker rnnoise_channels(). */
typedef struct ThreadData {
    AVFrame *in, *out;  /* source (read) and destination (written) frames */
} ThreadData;
1408 
1409 static int rnnoise_channels(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
1410 {
1411  AudioRNNContext *s = ctx->priv;
1412  ThreadData *td = arg;
1413  AVFrame *in = td->in;
1414  AVFrame *out = td->out;
1415  const int start = (out->ch_layout.nb_channels * jobnr) / nb_jobs;
1416  const int end = (out->ch_layout.nb_channels * (jobnr+1)) / nb_jobs;
1417 
1418  for (int ch = start; ch < end; ch++) {
1419  rnnoise_channel(s, &s->st[ch],
1420  (float *)out->extended_data[ch],
1421  (const float *)in->extended_data[ch],
1422  ctx->is_disabled);
1423  }
1424 
1425  return 0;
1426 }
1427 
1429 {
1430  AVFilterContext *ctx = inlink->dst;
1431  AVFilterLink *outlink = ctx->outputs[0];
1432  AVFrame *out = NULL;
1433  ThreadData td;
1434 
1435  out = ff_get_audio_buffer(outlink, FRAME_SIZE);
1436  if (!out) {
1437  av_frame_free(&in);
1438  return AVERROR(ENOMEM);
1439  }
1440  av_frame_copy_props(out, in);
1441 
1442  td.in = in; td.out = out;
1445 
1446  av_frame_free(&in);
1447  return ff_filter_frame(outlink, out);
1448 }
1449 
1451 {
1452  AVFilterLink *inlink = ctx->inputs[0];
1453  AVFilterLink *outlink = ctx->outputs[0];
1454  AVFrame *in = NULL;
1455  int ret;
1456 
1458 
1460  if (ret < 0)
1461  return ret;
1462 
1463  if (ret > 0)
1464  return filter_frame(inlink, in);
1465 
1466  FF_FILTER_FORWARD_STATUS(inlink, outlink);
1467  FF_FILTER_FORWARD_WANTED(outlink, inlink);
1468 
1469  return FFERROR_NOT_READY;
1470 }
1471 
1473 {
1474  AudioRNNContext *s = ctx->priv;
1475  int ret;
1476  FILE *f;
1477 
1478  if (!s->model_name)
1479  return AVERROR(EINVAL);
1480  f = avpriv_fopen_utf8(s->model_name, "r");
1481  if (!f) {
1482  av_log(ctx, AV_LOG_ERROR, "Failed to open model file: %s\n", s->model_name);
1483  return AVERROR(EINVAL);
1484  }
1485 
1486  ret = rnnoise_model_from_file(f, model);
1487  fclose(f);
1488  if (!*model || ret < 0)
1489  return ret;
1490 
1491  return 0;
1492 }
1493 
1495 {
1496  AudioRNNContext *s = ctx->priv;
1497  int ret;
1498 
1499  s->fdsp = avpriv_float_dsp_alloc(0);
1500  if (!s->fdsp)
1501  return AVERROR(ENOMEM);
1502 
1503  ret = open_model(ctx, &s->model[0]);
1504  if (ret < 0)
1505  return ret;
1506 
1507  for (int i = 0; i < FRAME_SIZE; i++) {
1508  s->window[i] = sin(.5*M_PI*sin(.5*M_PI*(i+.5)/FRAME_SIZE) * sin(.5*M_PI*(i+.5)/FRAME_SIZE));
1509  s->window[WINDOW_SIZE - 1 - i] = s->window[i];
1510  }
1511 
1512  for (int i = 0; i < NB_BANDS; i++) {
1513  for (int j = 0; j < NB_BANDS; j++) {
1514  s->dct_table[j][i] = cosf((i + .5f) * j * M_PI / NB_BANDS);
1515  if (j == 0)
1516  s->dct_table[j][i] *= sqrtf(.5);
1517  }
1518  }
1519 
1520  return 0;
1521 }
1522 
1523 static void free_model(AVFilterContext *ctx, int n)
1524 {
1525  AudioRNNContext *s = ctx->priv;
1526 
1527  rnnoise_model_free(s->model[n]);
1528  s->model[n] = NULL;
1529 
1530  for (int ch = 0; ch < s->channels && s->st; ch++) {
1531  av_freep(&s->st[ch].rnn[n].vad_gru_state);
1532  av_freep(&s->st[ch].rnn[n].noise_gru_state);
1533  av_freep(&s->st[ch].rnn[n].denoise_gru_state);
1534  }
1535 }
1536 
/* Runtime command handler: supports swapping in a new model file.
 * The new model is loaded into spare slot 1, swapped to slot 0 together
 * with fresh per-channel states, and rolled back if reconfiguration fails. */
static int process_command(AVFilterContext *ctx, const char *cmd, const char *args,
                           char *res, int res_len, int flags)
{
    AudioRNNContext *s = ctx->priv;
    int ret;

    /* Apply the option change itself (e.g. updates s->model_name). */
    ret = ff_filter_process_command(ctx, cmd, args, res, res_len, flags);
    if (ret < 0)
        return ret;

    /* Load the newly named model into the spare slot. */
    ret = open_model(ctx, &s->model[1]);
    if (ret < 0)
        return ret;

    /* Make the new model and state buffers current. */
    FFSWAP(RNNModel *, s->model[0], s->model[1]);
    for (int ch = 0; ch < s->channels; ch++)
        FFSWAP(RNNState, s->st[ch].rnn[0], s->st[ch].rnn[1]);

    ret = config_input(ctx->inputs[0]);
    if (ret < 0) {
        /* Reconfiguration failed: undo both swaps to restore the old model. */
        for (int ch = 0; ch < s->channels; ch++)
            FFSWAP(RNNState, s->st[ch].rnn[0], s->st[ch].rnn[1]);
        FFSWAP(RNNModel *, s->model[0], s->model[1]);
        return ret;
    }

    /* The old model (now in slot 1) is no longer needed. */
    free_model(ctx, 1);
    return 0;
}
1566 
1568 {
1569  AudioRNNContext *s = ctx->priv;
1570 
1571  av_freep(&s->fdsp);
1572  free_model(ctx, 0);
1573  for (int ch = 0; ch < s->channels && s->st; ch++) {
1574  av_tx_uninit(&s->st[ch].tx);
1575  av_tx_uninit(&s->st[ch].txi);
1576  }
1577  av_freep(&s->st);
1578 }
1579 
/* Single audio input pad; config_input (re)allocates per-channel state. */
static const AVFilterPad inputs[] = {
    {
        .name = "default",
        .type = AVMEDIA_TYPE_AUDIO,
        .config_props = config_input,
    },
};
1587 
1588 #define OFFSET(x) offsetof(AudioRNNContext, x)
1589 #define AF AV_OPT_FLAG_AUDIO_PARAM|AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_RUNTIME_PARAM
1590 
/* User options: the model file to load (runtime-changeable via
 * process_command) and an output/input mix factor. */
static const AVOption arnndn_options[] = {
    { "model", "set model name", OFFSET(model_name), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, AF },
    { "m", "set model name", OFFSET(model_name), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, AF },
    { "mix", "set output vs input mix", OFFSET(mix), AV_OPT_TYPE_FLOAT, {.dbl=1.0},-1, 1, AF },
    { NULL }
};
1597 
1598 AVFILTER_DEFINE_CLASS(arnndn);
1599 
1601  .name = "arnndn",
1602  .description = NULL_IF_CONFIG_SMALL("Reduce noise from speech using Recurrent Neural Networks."),
1603  .priv_size = sizeof(AudioRNNContext),
1604  .priv_class = &arnndn_class,
1605  .activate = activate,
1606  .init = init,
1607  .uninit = uninit,
1613  .process_command = process_command,
1614 };
error
static void error(const char *err)
Definition: target_bsf_fuzzer.c:31
M
#define M(a, b)
Definition: vp3dsp.c:48
compute_dense
static void compute_dense(const DenseLayer *layer, float *output, const float *input)
Definition: af_arnndn.c:1252
ff_get_audio_buffer
AVFrame * ff_get_audio_buffer(AVFilterLink *link, int nb_samples)
Request an audio samples buffer with a specific set of permissions.
Definition: audio.c:107
AV_SAMPLE_FMT_FLTP
@ AV_SAMPLE_FMT_FLTP
float, planar
Definition: samplefmt.h:66
PITCH_MAX_PERIOD
#define PITCH_MAX_PERIOD
Definition: af_arnndn.c:51
td
#define td
Definition: regdef.h:70
pitch_downsample
static void pitch_downsample(float *x[], float *x_lp, int len, int C)
Definition: af_arnndn.c:742
WEIGHTS_SCALE
#define WEIGHTS_SCALE
Definition: af_arnndn.c:64
mix
static int mix(int c0, int c1)
Definition: 4xm.c:717
DenoiseState::synthesis_mem
float synthesis_mem[FRAME_SIZE]
Definition: af_arnndn.c:122
r
const char * r
Definition: vf_curves.c:126
AVERROR
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
opt.h
activate
static int activate(AVFilterContext *ctx)
Definition: af_arnndn.c:1450
mem_internal.h
GRULayer::activation
int activation
Definition: af_arnndn.c:88
out
FILE * out
Definition: movenc.c:54
dual_inner_prod
static void dual_inner_prod(const float *x, const float *y01, const float *y02, int N, float *xy1, float *xy2)
Definition: af_arnndn.c:784
ff_filter_frame
int ff_filter_frame(AVFilterLink *link, AVFrame *frame)
Send a frame of data to the next filter.
Definition: avfilter.c:978
sample_fmts
static enum AVSampleFormat sample_fmts[]
Definition: adpcmenc.c:947
FFERROR_NOT_READY
return FFERROR_NOT_READY
Definition: filter_design.txt:204
FREE_GRU
#define FREE_GRU(name)
AVTXContext
Definition: tx_priv.h:235
output
filter_frame For filters that do not use the this method is called when a frame is pushed to the filter s input It can be called at any time except in a reentrant way If the input frame is enough to produce output
Definition: filter_design.txt:225
inlink
The exact code depends on how similar the blocks are and how related they are to the and needs to apply these operations to the correct inlink or outlink if there are several Macros are available to factor that when no extra processing is inlink
Definition: filter_design.txt:212
ff_set_common_samplerates_from_list
int ff_set_common_samplerates_from_list(AVFilterContext *ctx, const int *samplerates)
Equivalent to ff_set_common_samplerates(ctx, ff_make_format_list(samplerates))
Definition: formats.c:754
PITCH_MIN_PERIOD
#define PITCH_MIN_PERIOD
Definition: af_arnndn.c:50
av_frame_free
void av_frame_free(AVFrame **frame)
Free the frame and any dynamically allocated objects in it, e.g.
Definition: frame.c:100
GRULayer::nb_neurons
int nb_neurons
Definition: af_arnndn.c:87
RNNState::noise_gru_state
float * noise_gru_state
Definition: af_arnndn.c:113
uninit
static av_cold void uninit(AVFilterContext *ctx)
Definition: af_arnndn.c:1567
inverse_transform
static void inverse_transform(DenoiseState *st, float *out, const AVComplexFloat *in)
Definition: af_arnndn.c:425
AVFrame
This structure describes decoded (raw) audio or video data.
Definition: frame.h:340
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:28
DenoiseState::lastg
float lastg[NB_BANDS]
Definition: af_arnndn.c:128
AVOption
AVOption.
Definition: opt.h:251
OFFSET
#define OFFSET(x)
Definition: af_arnndn.c:1588
b
#define b
Definition: input.c:41
arnndn_options
static const AVOption arnndn_options[]
Definition: af_arnndn.c:1591
FILTER_QUERY_FUNC
#define FILTER_QUERY_FUNC(func)
Definition: internal.h:169
frame_synthesis
static void frame_synthesis(AudioRNNContext *s, DenoiseState *st, float *out, const AVComplexFloat *y)
Definition: af_arnndn.c:510
NB_DELTA_CEPS
#define NB_DELTA_CEPS
Definition: af_arnndn.c:60
RNNModel::input_dense_size
int input_dense_size
Definition: af_arnndn.c:92
AVComplexFloat
Definition: tx.h:27
FFMAX
#define FFMAX(a, b)
Definition: macros.h:47
AVFilter::name
const char * name
Filter name.
Definition: avfilter.h:170
c1
static const uint64_t c1
Definition: murmur3.c:52
ThreadData::out
AVFrame * out
Definition: af_adeclick.c:526
AVChannelLayout::nb_channels
int nb_channels
Number of channels in this layout.
Definition: channel_layout.h:317
ThreadData::in
AVFrame * in
Definition: af_adecorrelate.c:153
tansig_table
static const float tansig_table[201]
Definition: af_arnndn.c:1175
find_best_pitch
static void find_best_pitch(float *xcorr, float *y, int len, int max_pitch, int *best_pitch)
Definition: af_arnndn.c:907
FF_FILTER_FORWARD_STATUS_BACK
#define FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink)
Forward the status on an output link to an input link.
Definition: filters.h:199
process_command
static int process_command(AVFilterContext *ctx, const char *cmd, const char *args, char *res, int res_len, int flags)
Definition: af_arnndn.c:1537
av_tx_init
av_cold int av_tx_init(AVTXContext **ctx, av_tx_fn *tx, enum AVTXType type, int inv, int len, const void *scale, uint64_t flags)
Initialize a transform context with the given configuration (i)MDCTs with an odd length are currently...
Definition: tx.c:901
DenoiseState::memid
int memid
Definition: af_arnndn.c:121
RNN_CLEAR
#define RNN_CLEAR(dst, n)
Definition: af_arnndn.c:407
GRULayer::nb_inputs
int nb_inputs
Definition: af_arnndn.c:86
compute_band_energy
static void compute_band_energy(float *bandE, const AVComplexFloat *X)
Definition: af_arnndn.c:448
formats.h
compute_rnn
static void compute_rnn(AudioRNNContext *s, RNNState *rnn, float *gains, float *vad, const float *input)
Definition: af_arnndn.c:1333
DenoiseState::txi
AVTXContext * txi
Definition: af_arnndn.c:131
free_model
static void free_model(AVFilterContext *ctx, int n)
Definition: af_arnndn.c:1523
RNNState::denoise_gru_state
float * denoise_gru_state
Definition: af_arnndn.c:114
rnnoise_channels
static int rnnoise_channels(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
Definition: af_arnndn.c:1409
ACTIVATION_RELU
#define ACTIVATION_RELU
Definition: af_arnndn.c:70
AVComplexFloat::im
float im
Definition: tx.h:28
DenoiseState::mem_hp_x
float mem_hp_x[2]
Definition: af_arnndn.c:127
window
static SDL_Window * window
Definition: ffplay.c:360
cosf
#define cosf(x)
Definition: libm.h:78
log10f
#define log10f(x)
Definition: libm.h:414
AudioRNNContext::model
RNNModel * model[2]
Definition: af_arnndn.c:147
rnnoise_model_free
static void rnnoise_model_free(RNNModel *model)
Definition: af_arnndn.c:156
AudioRNNContext::st
DenoiseState * st
Definition: af_arnndn.c:142
DenoiseState::cepstral_mem
float cepstral_mem[CEPS_MEM][NB_BANDS]
Definition: af_arnndn.c:120
scale
static av_always_inline float scale(float x, float s)
Definition: vf_v360.c:1389
SQUARE
#define SQUARE(x)
Definition: af_arnndn.c:55
AF
#define AF
Definition: af_arnndn.c:1589
DenseLayer::bias
const float * bias
Definition: af_arnndn.c:75
AVFilterPad
A filter pad used for either input or output.
Definition: internal.h:47
FREQ_SIZE
#define FREQ_SIZE
Definition: af_arnndn.c:48
T
#define T(x)
Definition: vpx_arith.h:29
compute_band_corr
static void compute_band_corr(float *bandE, const AVComplexFloat *X, const AVComplexFloat *P)
Definition: af_arnndn.c:473
DenoiseState::history
float history[FRAME_SIZE]
Definition: af_arnndn.c:129
C
s EdgeDetect Foobar g libavfilter vf_edgedetect c libavfilter vf_foobar c edit libavfilter and add an entry for foobar following the pattern of the other filters edit libavfilter allfilters and add an entry for foobar following the pattern of the other filters configure make j< whatever > ffmpeg ffmpeg i you should get a foobar png with Lena edge detected That s your new playground is ready Some little details about what s going which in turn will define variables for the build system and the C
Definition: writing_filters.txt:58
avassert.h
AV_LOG_ERROR
#define AV_LOG_ERROR
Something went wrong and cannot losslessly be recovered.
Definition: log.h:180
av_cold
#define av_cold
Definition: attributes.h:90
av_tx_fn
void(* av_tx_fn)(AVTXContext *s, void *out, void *in, ptrdiff_t stride)
Function pointer to a function to perform the transform.
Definition: tx.h:151
float
float
Definition: af_crystalizer.c:121
MAX_NEURONS
#define MAX_NEURONS
Definition: af_arnndn.c:66
X
@ X
Definition: vf_addroi.c:27
s
#define s(width, name)
Definition: cbs_vp9.c:198
frame_analysis
static void frame_analysis(AudioRNNContext *s, DenoiseState *st, AVComplexFloat *X, float *Ex, const float *in)
Definition: af_arnndn.c:498
DenseLayer::nb_inputs
int nb_inputs
Definition: af_arnndn.c:77
CEPS_MEM
#define CEPS_MEM
Definition: af_arnndn.c:59
floor
static __device__ float floor(float a)
Definition: cuda_runtime.h:173
inputs
static const AVFilterPad inputs[]
Definition: af_arnndn.c:1580
g
const char * g
Definition: vf_curves.c:127
celt_inner_prod
static float celt_inner_prod(const float *x, const float *y, int N)
Definition: af_arnndn.c:596
AVMEDIA_TYPE_AUDIO
@ AVMEDIA_TYPE_AUDIO
Definition: avutil.h:202
ff_set_common_formats_from_list
int ff_set_common_formats_from_list(AVFilterContext *ctx, const int *fmts)
Equivalent to ff_set_common_formats(ctx, ff_make_format_list(fmts))
Definition: formats.c:776
av_assert0
#define av_assert0(cond)
assert() equivalent, that is always enabled.
Definition: avassert.h:40
filters.h
AV_TX_FLOAT_FFT
@ AV_TX_FLOAT_FFT
Standard complex to complex FFT with sample data type of AVComplexFloat, AVComplexDouble or AVComplex...
Definition: tx.h:47
ctx
AVFormatContext * ctx
Definition: movenc.c:48
RNNModel::vad_gru_size
int vad_gru_size
Definition: af_arnndn.c:95
xi
#define xi(width, name, var, range_min, range_max, subs,...)
Definition: cbs_h2645.c:417
rnnoise_model_from_file
static int rnnoise_model_from_file(FILE *f, RNNModel **rnn)
Definition: af_arnndn.c:186
ff_af_arnndn
const AVFilter ff_af_arnndn
Definition: af_arnndn.c:1600
config_input
static int config_input(AVFilterLink *inlink)
Definition: af_arnndn.c:348
FRAME_SIZE_SHIFT
#define FRAME_SIZE_SHIFT
Definition: af_arnndn.c:45
ACTIVATION_TANH
#define ACTIVATION_TANH
Definition: af_arnndn.c:68
FILTER_INPUTS
#define FILTER_INPUTS(array)
Definition: internal.h:192
file_open.h
E
#define E
Definition: avdct.c:32
arg
const char * arg
Definition: jacosubdec.c:67
FFABS
#define FFABS(a)
Absolute value, Note, INT_MIN / INT64_MIN result in undefined behavior as they are not representable ...
Definition: common.h:65
if
if(ret)
Definition: filter_design.txt:179
RNNModel::vad_gru
const GRULayer * vad_gru
Definition: af_arnndn.c:96
AVClass
Describe the class of an AVClass context structure.
Definition: log.h:66
ff_inlink_consume_samples
int ff_inlink_consume_samples(AVFilterLink *link, unsigned min, unsigned max, AVFrame **rframe)
Take samples from the link's FIFO and update the link's stats.
Definition: avfilter.c:1402
NULL
#define NULL
Definition: coverity.c:32
LOCAL_ALIGNED_32
#define LOCAL_ALIGNED_32(t, v,...)
Definition: mem_internal.h:135
av_frame_copy_props
int av_frame_copy_props(AVFrame *dst, const AVFrame *src)
Copy only "metadata" fields from src to dst.
Definition: frame.c:736
sigmoid_approx
static float sigmoid_approx(float x)
Definition: af_arnndn.c:1247
RNNModel::denoise_gru_size
int denoise_gru_size
Definition: af_arnndn.c:101
RNNModel::vad_output
const DenseLayer * vad_output
Definition: af_arnndn.c:108
isnan
#define isnan(x)
Definition: libm.h:340
GRULayer::recurrent_weights
const float * recurrent_weights
Definition: af_arnndn.c:85
FREE_DENSE
#define FREE_DENSE(name)
PITCH_BUF_SIZE
#define PITCH_BUF_SIZE
Definition: af_arnndn.c:53
ff_audio_default_filterpad
const AVFilterPad ff_audio_default_filterpad[1]
An AVFilterPad array whose only entry has name "default" and is of type AVMEDIA_TYPE_AUDIO.
Definition: audio.c:32
sqrtf
static __device__ float sqrtf(float a)
Definition: cuda_runtime.h:184
PITCH_FRAME_SIZE
#define PITCH_FRAME_SIZE
Definition: af_arnndn.c:52
av_clipf
av_clipf
Definition: af_crystalizer.c:121
ff_set_common_all_channel_counts
int ff_set_common_all_channel_counts(AVFilterContext *ctx)
Equivalent to ff_set_common_channel_layouts(ctx, ff_all_channel_counts())
Definition: formats.c:742
RNNModel::input_dense
const DenseLayer * input_dense
Definition: af_arnndn.c:93
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
DenseLayer::input_weights
const float * input_weights
Definition: af_arnndn.c:76
float_dsp.h
biquad
static void biquad(float *y, float mem[2], const float *x, const float *b, const float *a, int N)
Definition: af_arnndn.c:392
DenoiseState::pitch_buf
float pitch_buf[PITCH_BUF_SIZE]
Definition: af_arnndn.c:123
f
f
Definition: af_crystalizer.c:121
INPUT_SIZE
#define INPUT_SIZE
Definition: af_arnndn.c:1331
NULL_IF_CONFIG_SMALL
#define NULL_IF_CONFIG_SMALL(x)
Return NULL if CONFIG_SMALL is true, otherwise the argument without modification.
Definition: internal.h:106
NB_BANDS
#define NB_BANDS
Definition: af_arnndn.c:57
DECLARE_ALIGNED
#define DECLARE_ALIGNED(n, t, v)
Definition: mem_internal.h:87
P
#define P
shift
static int shift(int a, int b)
Definition: bonk.c:262
DenseLayer::nb_neurons
int nb_neurons
Definition: af_arnndn.c:78
AV_SAMPLE_FMT_NONE
@ AV_SAMPLE_FMT_NONE
Definition: samplefmt.h:56
celt_autocorr
static int celt_autocorr(const float *x, float *ac, const float *window, int overlap, int lag, int n)
Definition: af_arnndn.c:628
WINDOW_SIZE
#define WINDOW_SIZE
Definition: af_arnndn.c:47
AVComplexFloat::re
float re
Definition: tx.h:28
AudioRNNContext::mix
float mix
Definition: af_arnndn.c:139
AVFloatDSPContext
Definition: float_dsp.h:24
RNNModel::noise_gru_size
int noise_gru_size
Definition: af_arnndn.c:98
celt_lpc
static void celt_lpc(float *lpc, const float *ac, int p)
Definition: af_arnndn.c:666
ff_filter_process_command
int ff_filter_process_command(AVFilterContext *ctx, const char *cmd, const char *arg, char *res, int res_len, int flags)
Generic processing of user supplied commands that are set in the same way as the filter options.
Definition: avfilter.c:851
DenoiseState::rnn
RNNState rnn[2]
Definition: af_arnndn.c:130
a
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:41
RNN_MOVE
#define RNN_MOVE(dst, src, n)
Definition: af_arnndn.c:406
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
FF_FILTER_FORWARD_WANTED
FF_FILTER_FORWARD_WANTED(outlink, inlink)
N
#define N
Definition: af_mcompand.c:53
RNNModel::denoise_gru
const GRULayer * denoise_gru
Definition: af_arnndn.c:102
input
and forward the test the status of outputs and forward it to the corresponding return FFERROR_NOT_READY If the filters stores internally one or a few frame for some input
Definition: filter_design.txt:172
DenoiseState::last_gain
float last_gain
Definition: af_arnndn.c:125
M_PI
#define M_PI
Definition: mathematics.h:67
av_tx_uninit
av_cold void av_tx_uninit(AVTXContext **ctx)
Frees a context and sets *ctx to NULL, does nothing when *ctx == NULL.
Definition: tx.c:294
AudioRNNContext::channels
int channels
Definition: af_arnndn.c:141
DenoiseState::tx
AVTXContext * tx
Definition: af_arnndn.c:131
sample_rates
sample_rates
Definition: ffmpeg_filter.c:368
ACTIVATION_SIGMOID
#define ACTIVATION_SIGMOID
Definition: af_arnndn.c:69
AudioRNNContext::model_name
char * model_name
Definition: af_arnndn.c:138
AV_OPT_TYPE_FLOAT
@ AV_OPT_TYPE_FLOAT
Definition: opt.h:228
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:255
DenoiseState
Definition: af_arnndn.c:118
RNN_COPY
#define RNN_COPY(dst, src, n)
Definition: af_arnndn.c:408
AVFrame::extended_data
uint8_t ** extended_data
pointers to the data planes/channels.
Definition: frame.h:401
ff_filter_get_nb_threads
int ff_filter_get_nb_threads(AVFilterContext *ctx)
Get number of threads for current filter instance.
Definition: avfilter.c:786
AVSampleFormat
AVSampleFormat
Audio sample formats.
Definition: samplefmt.h:55
ThreadData
Used for passing data between threads.
Definition: dsddec.c:69
interp_band_gain
static void interp_band_gain(float *g, const float *bandE)
Definition: af_arnndn.c:1129
FFMIN
#define FFMIN(a, b)
Definition: macros.h:49
dct
static void dct(AudioRNNContext *s, float *out, const float *in)
Definition: af_arnndn.c:1011
AudioRNNContext
Definition: af_arnndn.c:135
FRAME_SIZE
#define FRAME_SIZE
Definition: af_arnndn.c:46
len
int len
Definition: vorbis_enc_data.h:426
AudioRNNContext::dct_table
float dct_table[FFALIGN(NB_BANDS, 4)][FFALIGN(NB_BANDS, 4)]
Definition: af_arnndn.c:145
AVFilterPad::name
const char * name
Pad name.
Definition: internal.h:53
avpriv_fopen_utf8
FILE * avpriv_fopen_utf8(const char *path, const char *mode)
Open a file using a UTF-8 filename.
Definition: file_open.c:159
av_calloc
void * av_calloc(size_t nmemb, size_t size)
Definition: mem.c:262
stride
#define stride
Definition: h264pred_template.c:537
AVFilter
Filter definition.
Definition: avfilter.h:166
open_model
static int open_model(AVFilterContext *ctx, RNNModel **model)
Definition: af_arnndn.c:1472
ret
ret
Definition: filter_design.txt:187
RNNModel
Definition: af_arnndn.c:91
FFSWAP
#define FFSWAP(type, a, b)
Definition: macros.h:52
compute_frame_features
static int compute_frame_features(AudioRNNContext *s, DenoiseState *st, AVComplexFloat *X, AVComplexFloat *P, float *Ex, float *Ep, float *Exp, float *features, const float *in)
Definition: af_arnndn.c:1021
DenseLayer
Definition: af_arnndn.c:74
GRULayer::input_weights
const float * input_weights
Definition: af_arnndn.c:84
AudioRNNContext::window
float window[WINDOW_SIZE]
Definition: af_arnndn.c:144
second_check
static const uint8_t second_check[16]
Definition: af_arnndn.c:803
remove_doubling
static float remove_doubling(float *x, int maxperiod, int minperiod, int N, int *T0_, int prev_period, float prev_gain)
Definition: af_arnndn.c:804
RNNModel::denoise_output_size
int denoise_output_size
Definition: af_arnndn.c:104
compute_pitch_gain
static float compute_pitch_gain(float xy, float xx, float yy)
Definition: af_arnndn.c:798
AVFILTER_DEFINE_CLASS
AVFILTER_DEFINE_CLASS(arnndn)
xcorr_kernel
static void xcorr_kernel(const float *x, const float *y, float sum[4], int len)
Definition: af_arnndn.c:527
RNNModel::vad_output_size
int vad_output_size
Definition: af_arnndn.c:107
pitch_search
static void pitch_search(const float *x_lp, float *y, int len, int max_pitch, int *pitch)
Definition: af_arnndn.c:954
pitch_filter
static void pitch_filter(AVComplexFloat *X, const AVComplexFloat *P, const float *Ex, const float *Ep, const float *Exp, const float *g)
Definition: af_arnndn.c:1144
avfilter.h
celt_pitch_xcorr
static void celt_pitch_xcorr(const float *x, const float *y, float *xcorr, int len, int max_pitch)
Definition: af_arnndn.c:607
RNNState::vad_gru_state
float * vad_gru_state
Definition: af_arnndn.c:112
INPUT_GRU
#define INPUT_GRU(name)
rnnoise_channel
static float rnnoise_channel(AudioRNNContext *s, DenoiseState *st, float *out, const float *in, int disabled)
Definition: af_arnndn.c:1361
celt_fir5
static void celt_fir5(const float *x, const float *num, float *y, int N, float *mem)
Definition: af_arnndn.c:699
state
static struct @362 state
filter_frame
static int filter_frame(AVFilterLink *inlink, AVFrame *in)
Definition: af_arnndn.c:1428
AVFilterContext
An instance of a filter.
Definition: avfilter.h:397
DenoiseState::pitch_enh_buf
float pitch_enh_buf[PITCH_BUF_SIZE]
Definition: af_arnndn.c:124
AVFILTER_FLAG_SLICE_THREADS
#define AVFILTER_FLAG_SLICE_THREADS
The filter supports multithreading by splitting frames into multiple parts and processing them concur...
Definition: avfilter.h:117
tansig_approx
static float tansig_approx(float x)
Definition: af_arnndn.c:1219
AudioRNNContext::fdsp
AVFloatDSPContext * fdsp
Definition: af_arnndn.c:149
Q15ONE
#define Q15ONE
Definition: af_arnndn.c:72
DenoiseState::last_period
int last_period
Definition: af_arnndn.c:126
audio.h
DenoiseState::tx_fn
av_tx_fn tx_fn
Definition: af_arnndn.c:132
query_formats
static int query_formats(AVFilterContext *ctx)
Definition: af_arnndn.c:329
forward_transform
static void forward_transform(DenoiseState *st, AVComplexFloat *out, const float *in)
Definition: af_arnndn.c:410
av_free
#define av_free(p)
Definition: tableprint_vlc.h:33
FF_FILTER_FORWARD_STATUS
FF_FILTER_FORWARD_STATUS(inlink, outlink)
FFALIGN
#define FFALIGN(x, a)
Definition: macros.h:78
alpha
static const int16_t alpha[]
Definition: ilbcdata.h:55
FILTER_OUTPUTS
#define FILTER_OUTPUTS(array)
Definition: internal.h:193
av_freep
#define av_freep(p)
Definition: tableprint_vlc.h:34
src
INIT_CLIP pixel * src
Definition: h264pred_template.c:418
avpriv_float_dsp_alloc
av_cold AVFloatDSPContext * avpriv_float_dsp_alloc(int bit_exact)
Allocate a float DSP context.
Definition: float_dsp.c:135
DenoiseState::txi_fn
av_tx_fn txi_fn
Definition: af_arnndn.c:132
d
d
Definition: ffmpeg_filter.c:368
AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL
#define AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL
Same as AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC, except that the filter will have its filter_frame() c...
Definition: avfilter.h:155
flags
#define flags(name, subs,...)
Definition: cbs_av1.c:474
DenseLayer::activation
int activation
Definition: af_arnndn.c:79
RNNModel::denoise_output
const DenseLayer * denoise_output
Definition: af_arnndn.c:105
av_log
#define av_log(a,...)
Definition: tableprint_vlc.h:27
AVERROR_INVALIDDATA
#define AVERROR_INVALIDDATA
Invalid data found when processing input.
Definition: error.h:61
h
h
Definition: vp9dsp_template.c:2038
RNNState
Definition: af_arnndn.c:111
ALLOC_LAYER
#define ALLOC_LAYER(type, name)
AV_OPT_TYPE_STRING
@ AV_OPT_TYPE_STRING
Definition: opt.h:229
GRULayer
Definition: af_arnndn.c:82
ff_filter_execute
static av_always_inline int ff_filter_execute(AVFilterContext *ctx, avfilter_action_func *func, void *arg, int *ret, int nb_jobs)
Definition: internal.h:144
int
int
Definition: ffmpeg_filter.c:368
compute_gru
static void compute_gru(AudioRNNContext *s, const GRULayer *gru, float *state, const float *input)
Definition: af_arnndn.c:1280
eband5ms
static const uint8_t eband5ms[]
Definition: af_arnndn.c:443
GRULayer::bias
const float * bias
Definition: af_arnndn.c:83
INPUT_DENSE
#define INPUT_DENSE(name)
RNNModel::noise_gru
const GRULayer * noise_gru
Definition: af_arnndn.c:99
NB_FEATURES
#define NB_FEATURES
Definition: af_arnndn.c:62
init
static av_cold int init(AVFilterContext *ctx)
Definition: af_arnndn.c:1494
tx.h
RNNState::model
RNNModel * model
Definition: af_arnndn.c:115
DenoiseState::analysis_mem
float analysis_mem[FRAME_SIZE]
Definition: af_arnndn.c:119