/* Frame geometry constants.
 * FRAME_SIZE expands to 120 << 2 = 480, WINDOW_SIZE to 2 * 480 = 960 and
 * FREQ_SIZE to 480 + 1 = 481.
 * NOTE(review): the units are presumably audio samples (and frequency bins
 * for FREQ_SIZE) -- confirm against the buffers sized with these macros. */
47 #define FRAME_SIZE_SHIFT 2
48 #define FRAME_SIZE (120<<FRAME_SIZE_SHIFT)
49 #define WINDOW_SIZE (2*FRAME_SIZE)
50 #define FREQ_SIZE (FRAME_SIZE + 1)
/* Pitch-analysis bounds.
 * PITCH_BUF_SIZE expands to PITCH_MAX_PERIOD + PITCH_FRAME_SIZE = 768 + 960 = 1728.
 * NOTE(review): periods and sizes are presumably counted in samples -- confirm. */
52 #define PITCH_MIN_PERIOD 60
53 #define PITCH_MAX_PERIOD 768
54 #define PITCH_FRAME_SIZE 960
55 #define PITCH_BUF_SIZE (PITCH_MAX_PERIOD+PITCH_FRAME_SIZE)
/* Multiplies the argument by itself. The argument is expanded (and therefore
 * evaluated) twice, so it must not have side effects, e.g. SQUARE(i++). */
57 #define SQUARE(x) ((x)*(x))
/* Feature-vector sizing.
 * NB_FEATURES is NB_BANDS (defined separately) plus 3 * NB_DELTA_CEPS (= 18)
 * plus 2.
 * NOTE(review): the meaning of the individual terms (presumably cepstral
 * coefficients, their deltas, and two extra scalar features) is not visible
 * here -- confirm against the feature-extraction code. */
62 #define NB_DELTA_CEPS 6
64 #define NB_FEATURES (NB_BANDS+3*NB_DELTA_CEPS+2)
/* WEIGHTS_SCALE is the float constant 1/256 (0.00390625).
 * NOTE(review): presumably the factor applied to integer weights read from
 * the model file -- confirm at the use sites. */
66 #define WEIGHTS_SCALE (1.f/256)
/* NOTE(review): INPUT_VAL rejects parsed values above the literal 128;
 * presumably that is this same limit -- confirm, and consider using the
 * macro there so the two cannot drift apart. */
68 #define MAX_NEURONS 128
/* In-memory activation identifiers. INPUT_ACTIVATION stores one of these in
 * the destination it is given (a layer's `activation` field when invoked
 * from INPUT_DENSE / INPUT_GRU). */
70 #define ACTIVATION_TANH 0
71 #define ACTIVATION_SIGMOID 1
72 #define ACTIVATION_RELU 2
/* Activation codes as compared against the integer parsed by
 * INPUT_ACTIVATION, which checks for F_ACTIVATION_SIGMOID and
 * F_ACTIVATION_RELU and maps them to the matching ACTIVATION_* value.
 * The numeric values are identical to the ACTIVATION_* constants. */
154 #define F_ACTIVATION_TANH 0
155 #define F_ACTIVATION_SIGMOID 1
156 #define F_ACTIVATION_RELU 2
/* Releases ptr with the C library free() when it is non-NULL. The argument
 * is expanded twice, so it must not have side effects.
 * NOTE(review): free(NULL) is already a no-op in standard C, so the guard is
 * redundant. This macro also calls libc free() while the neighbouring
 * FREE_DENSE / FREE_GRU macros use av_free(); confirm it is never applied to
 * av_malloc()/av_calloc() memory, or drop it if it has no users. */
160 #define FREE_MAYBE(ptr) do { if (ptr) free(ptr); } while (0)
161 #define FREE_DENSE(name) do { \
163 av_free((void *) model->name->input_weights); \
164 av_free((void *) model->name->bias); \
165 av_free((void *) model->name); \
168 #define FREE_GRU(name) do { \
170 av_free((void *) model->name->input_weights); \
171 av_free((void *) model->name->recurrent_weights); \
172 av_free((void *) model->name->bias); \
173 av_free((void *) model->name); \
199 if (fscanf(
f,
"rnnoise-nu model file version %d\n", &in) != 1 || in != 1)
206 #define ALLOC_LAYER(type, name) \
207 name = av_calloc(1, sizeof(type)); \
209 rnnoise_model_free(ret); \
210 return AVERROR(ENOMEM); \
221 #define INPUT_VAL(name) do { \
222 if (fscanf(f, "%d", &in) != 1 || in < 0 || in > 128) { \
223 rnnoise_model_free(ret); \
224 return AVERROR(EINVAL); \
229 #define INPUT_ACTIVATION(name) do { \
231 INPUT_VAL(activation); \
232 switch (activation) { \
233 case F_ACTIVATION_SIGMOID: \
234 name = ACTIVATION_SIGMOID; \
236 case F_ACTIVATION_RELU: \
237 name = ACTIVATION_RELU; \
240 name = ACTIVATION_TANH; \
244 #define INPUT_ARRAY(name, len) do { \
245 float *values = av_calloc((len), sizeof(float)); \
247 rnnoise_model_free(ret); \
248 return AVERROR(ENOMEM); \
251 for (int i = 0; i < (len); i++) { \
252 if (fscanf(f, "%d", &in) != 1) { \
253 rnnoise_model_free(ret); \
254 return AVERROR(EINVAL); \
260 #define INPUT_ARRAY3(name, len0, len1, len2) do { \
261 float *values = av_calloc(FFALIGN((len0), 4) * FFALIGN((len1), 4) * (len2), sizeof(float)); \
263 rnnoise_model_free(ret); \
264 return AVERROR(ENOMEM); \
267 for (int k = 0; k < (len0); k++) { \
268 for (int i = 0; i < (len2); i++) { \
269 for (int j = 0; j < (len1); j++) { \
270 if (fscanf(f, "%d", &in) != 1) { \
271 rnnoise_model_free(ret); \
272 return AVERROR(EINVAL); \
274 values[j * (len2) * FFALIGN((len0), 4) + i * FFALIGN((len0), 4) + k] = in; \
280 #define NEW_LINE() do { \
282 while ((c = fgetc(f)) != EOF) { \
288 #define INPUT_DENSE(name) do { \
289 INPUT_VAL(name->nb_inputs); \
290 INPUT_VAL(name->nb_neurons); \
291 ret->name ## _size = name->nb_neurons; \
292 INPUT_ACTIVATION(name->activation); \
294 INPUT_ARRAY(name->input_weights, name->nb_inputs * name->nb_neurons); \
296 INPUT_ARRAY(name->bias, name->nb_neurons); \
300 #define INPUT_GRU(name) do { \
301 INPUT_VAL(name->nb_inputs); \
302 INPUT_VAL(name->nb_neurons); \
303 ret->name ## _size = name->nb_neurons; \
304 INPUT_ACTIVATION(name->activation); \
306 INPUT_ARRAY3(name->input_weights, name->nb_inputs, name->nb_neurons, 3); \
308 INPUT_ARRAY3(name->recurrent_weights, name->nb_neurons, name->nb_neurons, 3); \
310 INPUT_ARRAY(name->bias, name->nb_neurons * 3); \
356 s->channels =
inlink->channels;
363 for (
int i = 0;
i <
s->channels;
i++) {
376 for (
int i = 0;
i <
s->channels;
i++) {
393 static void biquad(
float *y,
float mem[2],
const float *x,
394 const float *
b,
const float *
a,
int N)
396 for (
int i = 0;
i <
N;
i++) {
401 mem[0] = mem[1] + (
b[0]*
xi -
a[0]*yi);
402 mem[1] = (
b[1]*
xi -
a[1]*yi);
/* Element-count wrappers around memmove / memset / memcpy: n is a number of
 * elements and is scaled by sizeof(*(dst)) to obtain the byte count.
 * In RNN_MOVE and RNN_COPY the term 0*((dst)-(src)) contributes nothing to
 * the size; the pointer subtraction presumably exists so that a dst/src pair
 * of incompatible pointer types is diagnosed at compile time.
 * All arguments are expanded more than once and must be free of side effects.
 * RNN_COPY inherits memcpy's requirement that the regions do not overlap;
 * use RNN_MOVE when they may. */
407 #define RNN_MOVE(dst, src, n) (memmove((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src)) ))
408 #define RNN_CLEAR(dst, n) (memset((dst), 0, (n)*sizeof(*(dst))))
409 #define RNN_COPY(dst, src, n) (memcpy((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src)) ))
421 st->
tx_fn(st->
tx, y, x,
sizeof(
float));
438 st->
txi_fn(st->
txi, y, x,
sizeof(
float));
446 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 34, 40, 48, 60, 78, 100
457 for (
int j = 0; j < band_size; j++) {
458 float tmp, frac = (float)j / band_size;
462 sum[
i] += (1.f - frac) *
tmp;
463 sum[
i + 1] += frac *
tmp;
482 for (
int j = 0; j < band_size; j++) {
483 float tmp, frac = (float)j / band_size;
487 sum[
i] += (1 - frac) *
tmp;
488 sum[
i + 1] += frac *
tmp;
515 const float mix =
s->mix;
516 const float imix = 1.f -
FFMAX(
mix, 0.
f);
528 static inline void xcorr_kernel(
const float *x,
const float *y,
float sum[4],
int len)
530 float y_0, y_1, y_2, y_3 = 0;
537 for (j = 0; j <
len - 3; j += 4) {
598 const float *y,
int N)
602 for (
int i = 0;
i <
N;
i++)
609 float *xcorr,
int len,
int max_pitch)
613 for (
i = 0;
i < max_pitch - 3;
i += 4) {
614 float sum[4] = { 0, 0, 0, 0};
619 xcorr[
i + 1] = sum[1];
620 xcorr[
i + 2] = sum[2];
621 xcorr[
i + 3] = sum[3];
624 for (;
i < max_pitch;
i++) {
644 for (
int i = 0;
i < n;
i++)
646 for (
int i = 0;
i < overlap;
i++) {
656 for (
int k = 0; k <= lag; k++) {
659 for (
int i = k + fastN;
i < n;
i++)
660 d += xptr[
i] * xptr[
i-k];
675 for (
int i = 0;
i < p;
i++) {
678 for (
int j = 0; j <
i; j++)
679 rr += (lpc[j] * ac[
i - j]);
684 for (
int j = 0; j < (
i + 1) >> 1; j++) {
688 lpc[j] = tmp1 + (
r*tmp2);
689 lpc[
i-1-j] = tmp2 + (
r*tmp1);
694 if (
error < .001
f * ac[0])
706 float num0, num1, num2, num3, num4;
707 float mem0, mem1, mem2, mem3, mem4;
720 for (
int i = 0;
i <
N;
i++) {
748 float lpc[4], mem[5]={0,0,0,0,0};
752 for (
int i = 1; i < len >> 1;
i++)
753 x_lp[
i] = .5
f * (.5
f * (x[0][(2*
i-1)]+x[0][(2*
i+1)])+x[0][2*
i]);
754 x_lp[0] = .5f * (.5f * (x[0][1])+x[0][0]);
756 for (
int i = 1; i < len >> 1;
i++)
757 x_lp[
i] += (.5
f * (.5
f * (x[1][(2*
i-1)]+x[1][(2*
i+1)])+x[1][2*
i]));
758 x_lp[0] += .5f * (.5f * (x[1][1])+x[1][0]);
766 for (
int i = 1;
i <= 4;
i++) {
768 ac[
i] -= ac[
i]*(.008f*
i)*(.008
f*
i);
772 for (
int i = 0;
i < 4;
i++) {
774 lpc[
i] = (lpc[
i] *
tmp);
777 lpc2[0] = lpc[0] + .8f;
778 lpc2[1] = lpc[1] + (
c1 * lpc[0]);
779 lpc2[2] = lpc[2] + (
c1 * lpc[1]);
780 lpc2[3] = lpc[3] + (
c1 * lpc[2]);
781 lpc2[4] = (
c1 * lpc[3]);
785 static inline void dual_inner_prod(
const float *x,
const float *y01,
const float *y02,
786 int N,
float *xy1,
float *xy2)
788 float xy01 = 0, xy02 = 0;
790 for (
int i = 0;
i <
N;
i++) {
791 xy01 += (x[
i] * y01[
i]);
792 xy02 += (x[
i] * y02[
i]);
801 return xy / sqrtf(1.
f + xx * yy);
/* Constant 16-entry byte lookup table.
 * NOTE(review): the code that indexes it is not shown next to this
 * definition; presumably it is read with the loop variable k of the
 * k = 2..15 loop in the pitch-period refinement -- confirm at the use site. */
804 static const uint8_t
second_check[16] = {0, 0, 3, 2, 3, 2, 5, 2, 3, 2, 3, 2, 5, 2, 3, 2};
806 int *T0_,
int prev_period,
float prev_gain)
813 float best_xy, best_yy;
818 minperiod0 = minperiod;
832 for (
i = 1;
i <= maxperiod;
i++) {
833 yy = yy+(x[-
i] * x[-
i])-(x[
N-
i] * x[
N-
i]);
834 yy_lookup[
i] =
FFMAX(0, yy);
841 for (k = 2; k <= 15; k++) {
861 xy = .5f * (xy + xy2);
862 yy = .5f * (yy_lookup[T1] + yy_lookup[T1b]);
864 if (
FFABS(T1-prev_period)<=1)
866 else if (
FFABS(T1-prev_period)<=2 && 5 * k * k < T0)
867 cont = prev_gain * .5f;
870 thresh =
FFMAX(.3
f, (.7
f * g0) - cont);
874 thresh =
FFMAX(.4
f, (.85
f * g0) - cont);
875 else if (T1<2*minperiod)
876 thresh =
FFMAX(.5
f, (.9
f * g0) - cont);
885 best_xy =
FFMAX(0, best_xy);
886 if (best_yy <= best_xy)
889 pg = best_xy/(best_yy + 1);
891 for (k = 0; k < 3; k++)
893 if ((xcorr[2]-xcorr[0]) > .7f * (xcorr[1]-xcorr[0]))
895 else if ((xcorr[0]-xcorr[2]) > (.7f * (xcorr[1] - xcorr[2])))
909 int max_pitch,
int *best_pitch)
922 for (
int j = 0; j <
len; j++)
925 for (
int i = 0;
i < max_pitch;
i++) {
934 num = xcorr16 * xcorr16;
935 if ((num * best_den[1]) > (best_num[1] * Syy)) {
936 if ((num * best_den[0]) > (best_num[0] * Syy)) {
937 best_num[1] = best_num[0];
938 best_den[1] = best_den[0];
939 best_pitch[1] = best_pitch[0];
956 int len,
int max_pitch,
int *pitch)
959 int best_pitch[2]={0,0};
969 for (
int j = 0; j < len >> 2; j++)
970 x_lp4[j] = x_lp[2*j];
971 for (
int j = 0; j < lag >> 2; j++)
981 for (
int i = 0; i < max_pitch >> 1;
i++) {
984 if (
FFABS(
i-2*best_pitch[0])>2 &&
FFABS(
i-2*best_pitch[1])>2)
987 xcorr[
i] =
FFMAX(-1, sum);
993 if (best_pitch[0] > 0 && best_pitch[0] < (max_pitch >> 1) - 1) {
996 a = xcorr[best_pitch[0] - 1];
997 b = xcorr[best_pitch[0]];
998 c = xcorr[best_pitch[0] + 1];
999 if (
c -
a > .7
f * (
b -
a))
1001 else if (
a -
c > .7
f * (
b-
c))
1009 *pitch = 2 * best_pitch[0] -
offset;
1018 out[
i] = sum * sqrtf(2.
f / 22);
1023 float *Ex,
float *Ep,
float *Exp,
float *features,
const float *in)
1026 float *ceps_0, *ceps_1, *ceps_2;
1027 float spec_variability = 0;
1035 float follow, logMax;
1060 Exp[
i] = Exp[
i] / sqrtf(.001
f+Ex[
i]*Ep[
i]);
1076 logMax =
FFMAX(logMax, Ly[
i]);
1077 follow =
FFMAX(follow-1.5, Ly[
i]);
1087 dct(
s, features, Ly);
1095 ceps_0[
i] = features[
i];
1099 features[
i] = ceps_0[
i] + ceps_1[
i] + ceps_2[
i];
1108 float mindist = 1e15f;
1109 for (
int j = 0; j <
CEPS_MEM; j++) {
1111 for (
int k = 0; k <
NB_BANDS; k++) {
1119 mindist =
FFMIN(mindist, dist);
1122 spec_variability += mindist;
1137 for (
int j = 0; j < band_size; j++) {
1138 float frac = (float)j / band_size;
1146 const float *Exp,
const float *
g)
1155 if (Exp[
i]>
g[
i])
r[
i] = 1;
1158 r[
i] *= sqrtf(Ex[
i]/(1e-8+Ep[
i]));
1162 X[
i].re += rf[
i]*
P[
i].re;
1163 X[
i].im += rf[
i]*
P[
i].im;
1167 norm[
i] = sqrtf(Ex[
i] / (1e-8+newE[
i]));
1171 X[
i].re *= normf[
i];
1172 X[
i].im *= normf[
i];
1177 0.000000f, 0.039979f, 0.079830f, 0.119427f, 0.158649f,
1178 0.197375f, 0.235496f, 0.272905f, 0.309507f, 0.345214f,
1179 0.379949f, 0.413644f, 0.446244f, 0.477700f, 0.507977f,
1180 0.537050f, 0.564900f, 0.591519f, 0.616909f, 0.641077f,
1181 0.664037f, 0.685809f, 0.706419f, 0.725897f, 0.744277f,
1182 0.761594f, 0.777888f, 0.793199f, 0.807569f, 0.821040f,
1183 0.833655f, 0.845456f, 0.856485f, 0.866784f, 0.876393f,
1184 0.885352f, 0.893698f, 0.901468f, 0.908698f, 0.915420f,
1185 0.921669f, 0.927473f, 0.932862f, 0.937863f, 0.942503f,
1186 0.946806f, 0.950795f, 0.954492f, 0.957917f, 0.961090f,
1187 0.964028f, 0.966747f, 0.969265f, 0.971594f, 0.973749f,
1188 0.975743f, 0.977587f, 0.979293f, 0.980869f, 0.982327f,
1189 0.983675f, 0.984921f, 0.986072f, 0.987136f, 0.988119f,
1190 0.989027f, 0.989867f, 0.990642f, 0.991359f, 0.992020f,
1191 0.992631f, 0.993196f, 0.993718f, 0.994199f, 0.994644f,
1192 0.995055f, 0.995434f, 0.995784f, 0.996108f, 0.996407f,
1193 0.996682f, 0.996937f, 0.997172f, 0.997389f, 0.997590f,
1194 0.997775f, 0.997946f, 0.998104f, 0.998249f, 0.998384f,
1195 0.998508f, 0.998623f, 0.998728f, 0.998826f, 0.998916f,
1196 0.999000f, 0.999076f, 0.999147f, 0.999213f, 0.999273f,
1197 0.999329f, 0.999381f, 0.999428f, 0.999472f, 0.999513f,
1198 0.999550f, 0.999585f, 0.999617f, 0.999646f, 0.999673f,
1199 0.999699f, 0.999722f, 0.999743f, 0.999763f, 0.999781f,
1200 0.999798f, 0.999813f, 0.999828f, 0.999841f, 0.999853f,
1201 0.999865f, 0.999875f, 0.999885f, 0.999893f, 0.999902f,
1202 0.999909f, 0.999916f, 0.999923f, 0.999929f, 0.999934f,
1203 0.999939f, 0.999944f, 0.999948f, 0.999952f, 0.999956f,
1204 0.999959f, 0.999962f, 0.999965f, 0.999968f, 0.999970f,
1205 0.999973f, 0.999975f, 0.999977f, 0.999978f, 0.999980f,
1206 0.999982f, 0.999983f, 0.999984f, 0.999986f, 0.999987f,
1207 0.999988f, 0.999989f, 0.999990f, 0.999990f, 0.999991f,
1208 0.999992f, 0.999992f, 0.999993f, 0.999994f, 0.999994f,
1209 0.999994f, 0.999995f, 0.999995f, 0.999996f, 0.999996f,
1210 0.999996f, 0.999997f, 0.999997f, 0.999997f, 0.999997f,
1211 0.999997f, 0.999998f, 0.999998f, 0.999998f, 0.999998f,
1212 0.999998f, 0.999998f, 0.999999f, 0.999999f, 0.999999f,
1213 0.999999f, 0.999999f, 0.999999f, 0.999999f, 0.999999f,
1214 0.999999f, 0.999999f, 0.999999f, 0.999999f, 0.999999f,
1215 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1216 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1244 y = y + x*dy*(1 - y*x);
1257 for (
int i = 0;
i <
N;
i++) {
1259 float sum = layer->
bias[
i];
1261 for (
int j = 0; j <
M; j++)
1268 for (
int i = 0;
i <
N;
i++)
1271 for (
int i = 0;
i <
N;
i++)
1274 for (
int i = 0;
i <
N;
i++)
1290 const int stride = 3 * AN, istride = 3 * AM;
1292 for (
int i = 0;
i <
N;
i++) {
1294 float sum = gru->
bias[
i];
1301 for (
int i = 0;
i <
N;
i++) {
1303 float sum = gru->
bias[
N +
i];
1310 for (
int i = 0;
i <
N;
i++) {
1312 float sum = gru->
bias[2 *
N +
i];
1315 for (
int j = 0; j <
N; j++)
/* NOTE(review): 42 is presumably the network input width and is expected to
 * equal NB_FEATURES (NB_BANDS + 3*NB_DELTA_CEPS + 2) -- confirm, and consider
 * deriving it from NB_FEATURES instead of repeating the literal. */
1332 #define INPUT_SIZE 42
/* Two-element coefficient arrays. The a_hp initialisers are double literals
 * and the b_hp initialisers are integer literals; both are implicitly
 * converted to float.
 * NOTE(review): presumably these are the `a` and `b` arguments of biquad()
 * (a high-pass stage, judging by the _hp suffix) -- confirm at the call. */
1375 static const float a_hp[2] = {-1.99599, 0.99600};
1376 static const float b_hp[2] = {-2, 1};
1382 if (!silence && !disabled) {
1401 memcpy(history, in,
FRAME_SIZE *
sizeof(*history));
1416 const int start = (
out->channels * jobnr) / nb_jobs;
1417 const int end = (
out->channels * (jobnr+1)) / nb_jobs;
1419 for (
int ch = start; ch < end; ch++) {
1421 (
float *)
out->extended_data[ch],
1489 if (!*model ||
ret < 0)
1514 for (
int j = 0; j <
NB_BANDS; j++) {
1517 s->dct_table[j][
i] *= sqrtf(.5);
1531 for (
int ch = 0; ch <
s->channels &&
s->st; ch++) {
1532 av_freep(&
s->st[ch].rnn[n].vad_gru_state);
1533 av_freep(&
s->st[ch].rnn[n].noise_gru_state);
1534 av_freep(&
s->st[ch].rnn[n].denoise_gru_state);
1539 char *res,
int res_len,
int flags)
1553 for (
int ch = 0; ch <
s->channels; ch++)
1558 for (
int ch = 0; ch <
s->channels; ch++)
1574 for (
int ch = 0; ch <
s->channels &&
s->st; ch++) {
/* OFFSET(x) yields the byte offset of member x inside AudioRNNContext.
 * AF is the bitwise OR of the audio, filtering and runtime AV_OPT_FLAG_*
 * bits. Its expansion is not parenthesised, so use it only where a bare
 * |-expression is safe (for example as a standalone initialiser).
 * NOTE(review): both are presumably consumed by this filter's AVOption
 * table -- confirm. */
1596 #define OFFSET(x) offsetof(AudioRNNContext, x)
1597 #define AF AV_OPT_FLAG_AUDIO_PARAM|AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_RUNTIME_PARAM
1610 .description =
NULL_IF_CONFIG_SMALL(
"Reduce noise from speech using Recurrent Neural Networks."),
1612 .priv_class = &arnndn_class,