af_atempo.c
/*
 * Copyright (c) 2012 Pavel Koshevoy <pkoshevoy at gmail dot com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * tempo scaling audio filter -- an implementation of the WSOLA algorithm
 *
 * Based on MIT licensed yaeAudioTempoFilter.h and yaeAudioFragment.h
 * from the Apprentice Video player by Pavel Koshevoy.
 * https://sourceforge.net/projects/apprenticevideo/
 *
 * An explanation of the SOLA algorithm is available at
 * http://www.surina.net/article/time-and-pitch-scaling.html
 *
 * WSOLA is very similar to SOLA; only one major difference exists between
 * these algorithms: SOLA shifts audio fragments along the output stream,
 * whereas WSOLA shifts audio fragments along the input stream.
 *
 * The advantage of the WSOLA algorithm is that the overlap region size is
 * always the same, therefore the blending function is constant and
 * can be precomputed.
 */

#include <float.h>
#include "libavcodec/avfft.h"
#include "libavutil/avassert.h"
#include "libavutil/avstring.h"
#include "libavutil/channel_layout.h"
#include "libavutil/eval.h"
#include "libavutil/opt.h"
#include "libavutil/samplefmt.h"
#include "avfilter.h"
#include "audio.h"
#include "internal.h"

/**
 * A fragment of audio waveform
 */
typedef struct {
    // index of the first sample of this fragment in the overall waveform;
    // 0: input sample position
    // 1: output sample position
    int64_t position[2];

    // original packed multi-channel samples:
    uint8_t *data;

    // number of samples in this fragment:
    int nsamples;

    // rDFT transform of the down-mixed mono fragment, used for
    // fast waveform alignment via correlation in frequency domain:
    FFTSample *xdat;
} AudioFragment;

/**
 * Filter state machine states
 */
typedef enum {
    YAE_LOAD_FRAGMENT,
    YAE_ADJUST_POSITION,
    YAE_RELOAD_FRAGMENT,
    YAE_OUTPUT_OVERLAP_ADD,
    YAE_FLUSH_OUTPUT,
} FilterState;

/**
 * Filter state machine
 */
typedef struct {
    // ring-buffer of input samples, necessary because sometimes the
    // input fragment position may be adjusted backwards:
    uint8_t *buffer;

    // ring-buffer maximum capacity, expressed in sample rate time base:
    int ring;

    // ring-buffer housekeeping:
    int size;
    int head;
    int tail;

    // 0: input sample position corresponding to the ring buffer tail
    // 1: output sample position
    int64_t position[2];

    // sample format:
    enum AVSampleFormat format;

    // number of channels:
    int channels;

    // number of bytes to skip from one sample to the next, across
    // multiple channels;
    // stride = (number-of-channels * bits-per-sample-per-channel) / 8
    int stride;

    // fragment window size, power-of-two integer:
    int window;

    // Hann window coefficients, for feathering
    // (blending) the overlapping fragment region:
    float *hann;

    // tempo scaling factor:
    double tempo;

    // cumulative alignment drift:
    int drift;

    // current/previous fragment ring-buffer:
    AudioFragment frag[2];

    // current fragment index:
    uint64_t nfrag;

    // current state:
    FilterState state;

    // for fast correlation calculation in frequency domain:
    RDFTContext *real_to_complex;
    RDFTContext *complex_to_real;
    FFTSample *correlation;

    // for managing AVFilterPad.request_frame and AVFilterPad.filter_frame:
    int request_fulfilled;
    AVFilterBufferRef *dst_buffer;
    uint8_t *dst;
    uint8_t *dst_end;
    uint64_t nsamples_in;
    uint64_t nsamples_out;
} ATempoContext;

/**
 * Reset the filter to its initial state; do not deallocate
 * the existing local buffers.
 */
static void yae_clear(ATempoContext *atempo)
{
    atempo->size = 0;
    atempo->head = 0;
    atempo->tail = 0;

    atempo->drift = 0;
    atempo->nfrag = 0;
    atempo->state = YAE_LOAD_FRAGMENT;

    atempo->position[0] = 0;
    atempo->position[1] = 0;

    atempo->frag[0].position[0] = 0;
    atempo->frag[0].position[1] = 0;
    atempo->frag[0].nsamples    = 0;

    atempo->frag[1].position[0] = 0;
    atempo->frag[1].position[1] = 0;
    atempo->frag[1].nsamples    = 0;

    // shift left position of the 1st fragment by half a window
    // so that no re-normalization would be required for
    // the left half of the 1st fragment:
    atempo->frag[0].position[0] = -(int64_t)(atempo->window / 2);
    atempo->frag[0].position[1] = -(int64_t)(atempo->window / 2);

    atempo->dst_buffer = NULL;
    atempo->dst        = NULL;
    atempo->dst_end    = NULL;

    atempo->request_fulfilled = 0;
    atempo->nsamples_in       = 0;
    atempo->nsamples_out      = 0;
}

/**
 * Reset the filter to its initial state and deallocate all buffers.
 */
static void yae_release_buffers(ATempoContext *atempo)
{
    yae_clear(atempo);

    av_freep(&atempo->frag[0].data);
    av_freep(&atempo->frag[1].data);
    av_freep(&atempo->frag[0].xdat);
    av_freep(&atempo->frag[1].xdat);

    av_freep(&atempo->buffer);
    av_freep(&atempo->hann);
    av_freep(&atempo->correlation);

    av_rdft_end(atempo->real_to_complex);
    atempo->real_to_complex = NULL;

    av_rdft_end(atempo->complex_to_real);
    atempo->complex_to_real = NULL;
}

/* av_realloc is not aligned enough; fortunately, the data does not need to
 * be preserved */
#define RE_MALLOC_OR_FAIL(field, field_size)    \
    do {                                        \
        av_freep(&field);                       \
        field = av_malloc(field_size);          \
        if (!field) {                           \
            yae_release_buffers(atempo);        \
            return AVERROR(ENOMEM);             \
        }                                       \
    } while (0)

/**
 * Prepare the filter for processing audio data of the given format,
 * sample rate and number of channels.
 */
static int yae_reset(ATempoContext *atempo,
                     enum AVSampleFormat format,
                     int sample_rate,
                     int channels)
{
    const int sample_size = av_get_bytes_per_sample(format);
    uint32_t nlevels = 0;
    uint32_t pot;
    int i;

    atempo->format   = format;
    atempo->channels = channels;
    atempo->stride   = sample_size * channels;

    // pick a segment window size:
    atempo->window = sample_rate / 24;

    // adjust window size to be a power-of-two integer:
    nlevels = av_log2(atempo->window);
    pot = 1 << nlevels;
    av_assert0(pot <= atempo->window);

    if (pot < atempo->window) {
        atempo->window = pot * 2;
        nlevels++;
    }
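    /*
     * Illustrative example (added commentary, not from the original
     * source): at a 44100 Hz sample rate, 44100 / 24 == 1837;
     * av_log2(1837) == 10, so pot == 1024 < 1837 and the window is
     * rounded up to pot * 2 == 2048 samples, with nlevels becoming 11.
     */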

    // initialize audio fragment buffers:
    RE_MALLOC_OR_FAIL(atempo->frag[0].data, atempo->window * atempo->stride);
    RE_MALLOC_OR_FAIL(atempo->frag[1].data, atempo->window * atempo->stride);
    RE_MALLOC_OR_FAIL(atempo->frag[0].xdat, atempo->window * sizeof(FFTComplex));
    RE_MALLOC_OR_FAIL(atempo->frag[1].xdat, atempo->window * sizeof(FFTComplex));

    // initialize rDFT contexts:
    av_rdft_end(atempo->real_to_complex);
    atempo->real_to_complex = NULL;

    av_rdft_end(atempo->complex_to_real);
    atempo->complex_to_real = NULL;

    atempo->real_to_complex = av_rdft_init(nlevels + 1, DFT_R2C);
    if (!atempo->real_to_complex) {
        yae_release_buffers(atempo);
        return AVERROR(ENOMEM);
    }

    atempo->complex_to_real = av_rdft_init(nlevels + 1, IDFT_C2R);
    if (!atempo->complex_to_real) {
        yae_release_buffers(atempo);
        return AVERROR(ENOMEM);
    }
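    /*
     * Added commentary: nlevels + 1 bits gives a transform of 2 * window
     * points. Each xdat buffer holds window FFTComplex values, i.e.
     * 2 * window FFTSample values, and yae_downmix() zeroes the buffer
     * before filling at most window samples, so the frequency-domain
     * correlation below is effectively linear rather than circular.
     */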

    RE_MALLOC_OR_FAIL(atempo->correlation, atempo->window * sizeof(FFTComplex));

    atempo->ring = atempo->window * 3;
    RE_MALLOC_OR_FAIL(atempo->buffer, atempo->ring * atempo->stride);

    // initialize the Hann window function:
    RE_MALLOC_OR_FAIL(atempo->hann, atempo->window * sizeof(float));

    for (i = 0; i < atempo->window; i++) {
        double t = (double)i / (double)(atempo->window - 1);
        double h = 0.5 * (1.0 - cos(2.0 * M_PI * t));
        atempo->hann[i] = (float)h;
    }
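    /*
     * Added commentary: this is the symmetric Hann window
     * h[i] = 0.5 * (1 - cos(2 * pi * i / (N - 1))). Complementary halves
     * of this table are used as cross-fade weights in yae_blend(), where
     * w0 + w1 stays approximately 1 across the overlap region -- which is
     * why no re-normalization is needed (see the comment in yae_clear()).
     */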

    yae_clear(atempo);
    return 0;
}

static int yae_set_tempo(AVFilterContext *ctx, const char *arg_tempo)
{
    ATempoContext *atempo = ctx->priv;
    char *tail = NULL;
    double tempo = av_strtod(arg_tempo, &tail);

    if (tail && *tail) {
        av_log(ctx, AV_LOG_ERROR, "Invalid tempo value '%s'\n", arg_tempo);
        return AVERROR(EINVAL);
    }

    if (tempo < 0.5 || tempo > 2.0) {
        av_log(ctx, AV_LOG_ERROR, "Tempo value %f exceeds [0.5, 2.0] range\n",
               tempo);
        return AVERROR(EINVAL);
    }

    atempo->tempo = tempo;
    return 0;
}
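/*
 * Usage note (added commentary): the tempo argument maps directly to
 * playback speed, e.g. "atempo=1.25" plays audio 25% faster without
 * altering pitch; values outside [0.5, 2.0] are rejected above.
 */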

static AudioFragment *yae_curr_frag(ATempoContext *atempo)
{
    return &atempo->frag[atempo->nfrag % 2];
}

static AudioFragment *yae_prev_frag(ATempoContext *atempo)
{
    return &atempo->frag[(atempo->nfrag + 1) % 2];
}

/**
 * A helper macro for initializing complex data buffer with scalar data
 * of a given type.
 */
#define yae_init_xdat(scalar_type, scalar_max)                          \
    do {                                                                \
        const uint8_t *src_end = src +                                  \
            frag->nsamples * atempo->channels * sizeof(scalar_type);    \
                                                                        \
        FFTSample *xdat = frag->xdat;                                   \
        scalar_type tmp;                                                \
                                                                        \
        if (atempo->channels == 1) {                                    \
            for (; src < src_end; xdat++) {                             \
                tmp = *(const scalar_type *)src;                        \
                src += sizeof(scalar_type);                             \
                                                                        \
                *xdat = (FFTSample)tmp;                                 \
            }                                                           \
        } else {                                                        \
            FFTSample s, max, ti, si;                                   \
            int i;                                                      \
                                                                        \
            for (; src < src_end; xdat++) {                             \
                tmp = *(const scalar_type *)src;                        \
                src += sizeof(scalar_type);                             \
                                                                        \
                max = (FFTSample)tmp;                                   \
                s = FFMIN((FFTSample)scalar_max,                        \
                          (FFTSample)fabsf(max));                       \
                                                                        \
                for (i = 1; i < atempo->channels; i++) {                \
                    tmp = *(const scalar_type *)src;                    \
                    src += sizeof(scalar_type);                         \
                                                                        \
                    ti = (FFTSample)tmp;                                \
                    si = FFMIN((FFTSample)scalar_max,                   \
                               (FFTSample)fabsf(ti));                   \
                                                                        \
                    if (s < si) {                                       \
                        s   = si;                                       \
                        max = ti;                                       \
                    }                                                   \
                }                                                       \
                                                                        \
                *xdat = max;                                            \
            }                                                           \
        }                                                               \
    } while (0)
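/*
 * Added commentary: for multi-channel input the macro above does not
 * average the channels; for every sample frame it keeps the single
 * channel value with the largest magnitude (clamped to scalar_max),
 * presumably because preserving transients this way gives the
 * correlation-based alignment a stronger signal than averaging would.
 */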

/**
 * Initialize complex data buffer of a given audio fragment
 * with down-mixed mono data of appropriate scalar type.
 */
static void yae_downmix(ATempoContext *atempo, AudioFragment *frag)
{
    // shortcut:
    const uint8_t *src = frag->data;

    // init complex data buffer used for FFT and correlation:
    memset(frag->xdat, 0, sizeof(FFTComplex) * atempo->window);

    if (atempo->format == AV_SAMPLE_FMT_U8) {
        yae_init_xdat(uint8_t, 127);
    } else if (atempo->format == AV_SAMPLE_FMT_S16) {
        yae_init_xdat(int16_t, 32767);
    } else if (atempo->format == AV_SAMPLE_FMT_S32) {
        yae_init_xdat(int, 2147483647);
    } else if (atempo->format == AV_SAMPLE_FMT_FLT) {
        yae_init_xdat(float, 1);
    } else if (atempo->format == AV_SAMPLE_FMT_DBL) {
        yae_init_xdat(double, 1);
    }
}

/**
 * Populate the internal data buffer on an as-needed basis.
 *
 * @return
 *   0 if requested data was already available or was successfully loaded,
 *   AVERROR(EAGAIN) if more input data is required.
 */
static int yae_load_data(ATempoContext *atempo,
                         const uint8_t **src_ref,
                         const uint8_t *src_end,
                         int64_t stop_here)
{
    // shortcut:
    const uint8_t *src = *src_ref;
    const int read_size = stop_here - atempo->position[0];

    if (stop_here <= atempo->position[0]) {
        return 0;
    }

    // samples are not expected to be skipped:
    av_assert0(read_size <= atempo->ring);

    while (atempo->position[0] < stop_here && src < src_end) {
        int src_samples = (src_end - src) / atempo->stride;

        // load data piece-wise, in order to avoid complicating the logic:
        int nsamples = FFMIN(read_size, src_samples);
        int na;
        int nb;

        nsamples = FFMIN(nsamples, atempo->ring);
        na = FFMIN(nsamples, atempo->ring - atempo->tail);
        nb = FFMIN(nsamples - na, atempo->ring);

        if (na) {
            uint8_t *a = atempo->buffer + atempo->tail * atempo->stride;
            memcpy(a, src, na * atempo->stride);

            src += na * atempo->stride;
            atempo->position[0] += na;

            atempo->size = FFMIN(atempo->size + na, atempo->ring);
            atempo->tail = (atempo->tail + na) % atempo->ring;
            atempo->head =
                atempo->size < atempo->ring ?
                atempo->tail - atempo->size :
                atempo->tail;
        }

        if (nb) {
            uint8_t *b = atempo->buffer;
            memcpy(b, src, nb * atempo->stride);

            src += nb * atempo->stride;
            atempo->position[0] += nb;

            atempo->size = FFMIN(atempo->size + nb, atempo->ring);
            atempo->tail = (atempo->tail + nb) % atempo->ring;
            atempo->head =
                atempo->size < atempo->ring ?
                atempo->tail - atempo->size :
                atempo->tail;
        }
    }

    // pass back the updated source buffer pointer:
    *src_ref = src;

    // sanity check:
    av_assert0(atempo->position[0] <= stop_here);

    return atempo->position[0] == stop_here ? 0 : AVERROR(EAGAIN);
}
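/*
 * Added commentary: the ring buffer holds 3 * window samples (see
 * yae_reset), so there is enough history for yae_adjust_position() to
 * move a fragment position backwards. A copy that would run past the
 * physical end of the buffer is split into two memcpy calls: na samples
 * up to the end, then nb samples wrapped around to the start.
 */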

/**
 * Populate the current audio fragment data buffer.
 *
 * @return
 *   0 when the fragment is ready,
 *   AVERROR(EAGAIN) if more input data is required.
 */
static int yae_load_frag(ATempoContext *atempo,
                         const uint8_t **src_ref,
                         const uint8_t *src_end)
{
    // shortcuts:
    AudioFragment *frag = yae_curr_frag(atempo);
    uint8_t *dst;
    int64_t missing, start, zeros;
    uint32_t nsamples;
    const uint8_t *a, *b;
    int i0, i1, n0, n1, na, nb;

    int64_t stop_here = frag->position[0] + atempo->window;
    if (src_ref && yae_load_data(atempo, src_ref, src_end, stop_here) != 0) {
        return AVERROR(EAGAIN);
    }

    // calculate the number of samples we don't have:
    missing =
        stop_here > atempo->position[0] ?
        stop_here - atempo->position[0] : 0;

    nsamples =
        missing < (int64_t)atempo->window ?
        (uint32_t)(atempo->window - missing) : 0;

    // setup the output buffer:
    frag->nsamples = nsamples;
    dst = frag->data;

    start = atempo->position[0] - atempo->size;
    zeros = 0;

    if (frag->position[0] < start) {
        // what we don't have we substitute with zeros:
        zeros = FFMIN(start - frag->position[0], (int64_t)nsamples);
        av_assert0(zeros != nsamples);

        memset(dst, 0, zeros * atempo->stride);
        dst += zeros * atempo->stride;
    }

    if (zeros == nsamples) {
        return 0;
    }

    // get the remaining data from the ring buffer:
    na = (atempo->head < atempo->tail ?
          atempo->tail - atempo->head :
          atempo->ring - atempo->head);

    nb = atempo->head < atempo->tail ? 0 : atempo->tail;

    // sanity check:
    av_assert0(nsamples <= zeros + na + nb);

    a = atempo->buffer + atempo->head * atempo->stride;
    b = atempo->buffer;

    i0 = frag->position[0] + zeros - start;
    i1 = i0 < na ? 0 : i0 - na;

    n0 = i0 < na ? FFMIN(na - i0, (int)(nsamples - zeros)) : 0;
    n1 = nsamples - zeros - n0;

    if (n0) {
        memcpy(dst, a + i0 * atempo->stride, n0 * atempo->stride);
        dst += n0 * atempo->stride;
    }

    if (n1) {
        memcpy(dst, b + i1 * atempo->stride, n1 * atempo->stride);
    }

    return 0;
}

/**
 * Prepare for loading the next audio fragment.
 */
static void yae_advance_to_next_frag(ATempoContext *atempo)
{
    const double fragment_step = atempo->tempo * (double)(atempo->window / 2);

    const AudioFragment *prev;
    AudioFragment       *frag;

    atempo->nfrag++;
    prev = yae_prev_frag(atempo);
    frag = yae_curr_frag(atempo);

    frag->position[0] = prev->position[0] + (int64_t)fragment_step;
    frag->position[1] = prev->position[1] + atempo->window / 2;
    frag->nsamples    = 0;
}
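/*
 * Added commentary: output positions always advance by window / 2
 * (constant 50% overlap), while input positions advance by
 * tempo * window / 2. For example, at tempo == 2.0 the input step is a
 * full window, so input is consumed twice as fast as output is produced.
 */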

/**
 * Calculate cross-correlation via rDFT.
 *
 * Multiply two vectors of complex numbers (result of real_to_complex rDFT)
 * and transform back via complex_to_real rDFT.
 */
static void yae_xcorr_via_rdft(FFTSample *xcorr,
                               RDFTContext *complex_to_real,
                               const FFTComplex *xa,
                               const FFTComplex *xb,
                               const int window)
{
    FFTComplex *xc = (FFTComplex *)xcorr;
    int i;

    // NOTE: the first element requires special care -- given Y = rDFT(X),
    // Im(Y[0]) and Im(Y[N/2]) are always zero, therefore av_rdft_calc
    // stores Re(Y[N/2]) in place of Im(Y[0]).

    xc->re = xa->re * xb->re;
    xc->im = xa->im * xb->im;
    xa++;
    xb++;
    xc++;

    for (i = 1; i < window; i++, xa++, xb++, xc++) {
        xc->re = (xa->re * xb->re + xa->im * xb->im);
        xc->im = (xa->im * xb->re - xa->re * xb->im);
    }

    // apply inverse rDFT:
    av_rdft_calc(complex_to_real, xcorr);
}
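/*
 * Added commentary: the loop above computes xa[i] * conj(xb[i]), so by
 * the cross-correlation theorem the inverse rDFT yields the
 * cross-correlation of the two fragments. Because the fragments were
 * zero-padded to twice the window size (see yae_reset), this is a
 * linear rather than circular correlation.
 */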

/**
 * Calculate the alignment offset for the given fragment
 * relative to the previous fragment.
 *
 * @return alignment offset of current fragment relative to previous.
 */
static int yae_align(AudioFragment *frag,
                     const AudioFragment *prev,
                     const int window,
                     const int delta_max,
                     const int drift,
                     FFTSample *correlation,
                     RDFTContext *complex_to_real)
{
    int best_offset = -drift;
    FFTSample best_metric = -FLT_MAX;
    FFTSample *xcorr;

    int i0;
    int i1;
    int i;

    yae_xcorr_via_rdft(correlation,
                       complex_to_real,
                       (const FFTComplex *)prev->xdat,
                       (const FFTComplex *)frag->xdat,
                       window);

    // identify search window boundaries:
    i0 = FFMAX(window / 2 - delta_max - drift, 0);
    i0 = FFMIN(i0, window);

    i1 = FFMIN(window / 2 + delta_max - drift, window - window / 16);
    i1 = FFMAX(i1, 0);

    // identify cross-correlation peaks within the search window:
    xcorr = correlation + i0;

    for (i = i0; i < i1; i++, xcorr++) {
        FFTSample metric = *xcorr;

        // normalize:
        FFTSample drifti = (FFTSample)(drift + i);
        metric *= drifti * (FFTSample)(i - i0) * (FFTSample)(i1 - i);

        if (metric > best_metric) {
            best_metric = metric;
            best_offset = i - window / 2;
        }
    }

    return best_offset;
}
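/*
 * Added commentary: the (i - i0) * (i1 - i) factor tapers the raw
 * correlation toward the edges of the search window, and the (drift + i)
 * factor appears to skew the choice of peak so that accumulated drift is
 * gradually pulled back rather than allowed to grow without bound.
 */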

/**
 * Adjust the current fragment position for better alignment
 * with the previous fragment.
 *
 * @return alignment correction.
 */
static int yae_adjust_position(ATempoContext *atempo)
{
    const AudioFragment *prev = yae_prev_frag(atempo);
    AudioFragment       *frag = yae_curr_frag(atempo);

    const int delta_max  = atempo->window / 2;
    const int correction = yae_align(frag,
                                     prev,
                                     atempo->window,
                                     delta_max,
                                     atempo->drift,
                                     atempo->correlation,
                                     atempo->complex_to_real);

    if (correction) {
        // adjust the fragment position:
        frag->position[0] -= correction;

        // clear so that the fragment can be reloaded:
        frag->nsamples = 0;

        // update the cumulative correction drift counter:
        atempo->drift += correction;
    }

    return correction;
}

/**
 * A helper macro for blending the overlap region of previous
 * and current audio fragment.
 */
#define yae_blend(scalar_type)                                          \
    do {                                                                \
        const scalar_type *aaa = (const scalar_type *)a;                \
        const scalar_type *bbb = (const scalar_type *)b;                \
                                                                        \
        scalar_type *out     = (scalar_type *)dst;                      \
        scalar_type *out_end = (scalar_type *)dst_end;                  \
        int64_t i;                                                      \
                                                                        \
        for (i = 0; i < overlap && out < out_end;                       \
             i++, atempo->position[1]++, wa++, wb++) {                  \
            float w0 = *wa;                                             \
            float w1 = *wb;                                             \
            int j;                                                      \
                                                                        \
            for (j = 0; j < atempo->channels;                           \
                 j++, aaa++, bbb++, out++) {                            \
                float t0 = (float)*aaa;                                 \
                float t1 = (float)*bbb;                                 \
                                                                        \
                *out =                                                  \
                    frag->position[0] + i < 0 ?                         \
                    *aaa :                                              \
                    (scalar_type)(t0 * w0 + t1 * w1);                   \
            }                                                           \
        }                                                               \
        dst = (uint8_t *)out;                                           \
    } while (0)
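/*
 * Added commentary: in yae_overlap_add() below, ia - ib == window / 2,
 * so wa and wb point at complementary halves of the Hann table and
 * w0 + w1 is approximately 1 for every overlap sample; the cross-fade
 * therefore preserves amplitude without re-normalization. The
 * frag->position[0] + i < 0 test passes the previous fragment through
 * unblended for samples that precede the start of the input.
 */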

/**
 * Blend the overlap region of previous and current audio fragment
 * and output the result to the given destination buffer.
 *
 * @return
 *   0 if the overlap region was completely stored in the dst buffer,
 *   AVERROR(EAGAIN) if more destination buffer space is required.
 */
static int yae_overlap_add(ATempoContext *atempo,
                           uint8_t **dst_ref,
                           uint8_t *dst_end)
{
    // shortcuts:
    const AudioFragment *prev = yae_prev_frag(atempo);
    const AudioFragment *frag = yae_curr_frag(atempo);

    const int64_t start_here = FFMAX(atempo->position[1],
                                     frag->position[1]);

    const int64_t stop_here = FFMIN(prev->position[1] + prev->nsamples,
                                    frag->position[1] + frag->nsamples);

    const int64_t overlap = stop_here - start_here;

    const int64_t ia = start_here - prev->position[1];
    const int64_t ib = start_here - frag->position[1];

    const float *wa = atempo->hann + ia;
    const float *wb = atempo->hann + ib;

    const uint8_t *a = prev->data + ia * atempo->stride;
    const uint8_t *b = frag->data + ib * atempo->stride;

    uint8_t *dst = *dst_ref;

    av_assert0(start_here <= stop_here &&
               frag->position[1] <= start_here &&
               overlap <= frag->nsamples);

    if (atempo->format == AV_SAMPLE_FMT_U8) {
        yae_blend(uint8_t);
    } else if (atempo->format == AV_SAMPLE_FMT_S16) {
        yae_blend(int16_t);
    } else if (atempo->format == AV_SAMPLE_FMT_S32) {
        yae_blend(int);
    } else if (atempo->format == AV_SAMPLE_FMT_FLT) {
        yae_blend(float);
    } else if (atempo->format == AV_SAMPLE_FMT_DBL) {
        yae_blend(double);
    }

    // pass back the updated destination buffer pointer:
    *dst_ref = dst;

    return atempo->position[1] == stop_here ? 0 : AVERROR(EAGAIN);
}

/**
 * Feed as much data to the filter as it is able to consume
 * and receive as much processed data in the destination buffer
 * as it is able to produce or store.
 */
static void
yae_apply(ATempoContext *atempo,
          const uint8_t **src_ref,
          const uint8_t *src_end,
          uint8_t **dst_ref,
          uint8_t *dst_end)
{
    while (1) {
        if (atempo->state == YAE_LOAD_FRAGMENT) {
            // load additional data for the current fragment:
            if (yae_load_frag(atempo, src_ref, src_end) != 0) {
                break;
            }

            // down-mix to mono:
            yae_downmix(atempo, yae_curr_frag(atempo));

            // apply rDFT:
            av_rdft_calc(atempo->real_to_complex, yae_curr_frag(atempo)->xdat);

            // must load the second fragment before alignment can start:
            if (!atempo->nfrag) {
                yae_advance_to_next_frag(atempo);
                continue;
            }

            atempo->state = YAE_ADJUST_POSITION;
        }

        if (atempo->state == YAE_ADJUST_POSITION) {
            // adjust position for better alignment:
            if (yae_adjust_position(atempo)) {
                // reload the fragment at the corrected position, so that the
                // Hann window blending would not require normalization:
                atempo->state = YAE_RELOAD_FRAGMENT;
            } else {
                atempo->state = YAE_OUTPUT_OVERLAP_ADD;
            }
        }

        if (atempo->state == YAE_RELOAD_FRAGMENT) {
            // load additional data if necessary due to position adjustment:
            if (yae_load_frag(atempo, src_ref, src_end) != 0) {
                break;
            }

            // down-mix to mono:
            yae_downmix(atempo, yae_curr_frag(atempo));

            // apply rDFT:
            av_rdft_calc(atempo->real_to_complex, yae_curr_frag(atempo)->xdat);

            atempo->state = YAE_OUTPUT_OVERLAP_ADD;
        }

        if (atempo->state == YAE_OUTPUT_OVERLAP_ADD) {
            // overlap-add and output the result:
            if (yae_overlap_add(atempo, dst_ref, dst_end) != 0) {
                break;
            }

            // advance to the next fragment, repeat:
            yae_advance_to_next_frag(atempo);
            atempo->state = YAE_LOAD_FRAGMENT;
        }
    }
}

/**
 * Flush any buffered data from the filter.
 *
 * @return
 *   0 if all data was completely stored in the dst buffer,
 *   AVERROR(EAGAIN) if more destination buffer space is required.
 */
static int yae_flush(ATempoContext *atempo,
                     uint8_t **dst_ref,
                     uint8_t *dst_end)
{
    AudioFragment *frag = yae_curr_frag(atempo);
    int64_t overlap_end;
    int64_t start_here;
    int64_t stop_here;
    int64_t offset;

    const uint8_t *src;
    uint8_t *dst;

    int src_size;
    int dst_size;
    int nbytes;

    atempo->state = YAE_FLUSH_OUTPUT;

    if (atempo->position[0] == frag->position[0] + frag->nsamples &&
        atempo->position[1] == frag->position[1] + frag->nsamples) {
        // the current fragment is already flushed:
        return 0;
    }

    if (frag->position[0] + frag->nsamples < atempo->position[0]) {
        // finish loading the current (possibly partial) fragment:
        yae_load_frag(atempo, NULL, NULL);

        if (atempo->nfrag) {
            // down-mix to mono:
            yae_downmix(atempo, frag);

            // apply rDFT:
            av_rdft_calc(atempo->real_to_complex, frag->xdat);

            // align the current fragment to the previous fragment:
            if (yae_adjust_position(atempo)) {
                // reload the current fragment due to adjusted position:
                yae_load_frag(atempo, NULL, NULL);
            }
        }
    }

    // flush the overlap region:
    overlap_end = frag->position[1] + FFMIN(atempo->window / 2,
                                            frag->nsamples);

    while (atempo->position[1] < overlap_end) {
        if (yae_overlap_add(atempo, dst_ref, dst_end) != 0) {
            return AVERROR(EAGAIN);
        }
    }

    // flush the remainder of the current fragment:
    start_here = FFMAX(atempo->position[1], overlap_end);
    stop_here  = frag->position[1] + frag->nsamples;
    offset     = start_here - frag->position[1];
    av_assert0(start_here <= stop_here && frag->position[1] <= start_here);

    src = frag->data + offset * atempo->stride;
    dst = (uint8_t *)*dst_ref;

    src_size = (int)(stop_here - start_here) * atempo->stride;
    dst_size = dst_end - dst;
    nbytes = FFMIN(src_size, dst_size);

    memcpy(dst, src, nbytes);
    dst += nbytes;

    atempo->position[1] += (nbytes / atempo->stride);

    // pass back the updated destination buffer pointer:
    *dst_ref = (uint8_t *)dst;

    return atempo->position[1] == stop_here ? 0 : AVERROR(EAGAIN);
}

static av_cold int init(AVFilterContext *ctx, const char *args)
{
    ATempoContext *atempo = ctx->priv;

    // NOTE: this assumes that the caller has memset ctx->priv to 0:
    atempo->format = AV_SAMPLE_FMT_NONE;
    atempo->tempo  = 1.0;
    atempo->state  = YAE_LOAD_FRAGMENT;

    return args ? yae_set_tempo(ctx, args) : 0;
}

static av_cold void uninit(AVFilterContext *ctx)
{
    ATempoContext *atempo = ctx->priv;
    yae_release_buffers(atempo);
}

static int query_formats(AVFilterContext *ctx)
{
    AVFilterChannelLayouts *layouts = NULL;
    AVFilterFormats        *formats = NULL;

    // WSOLA necessitates an internal sliding window ring buffer
    // for the incoming audio stream.
    //
    // Planar sample formats are too cumbersome to store in a ring buffer,
    // therefore planar sample formats are not supported.
    //
    static const enum AVSampleFormat sample_fmts[] = {
        AV_SAMPLE_FMT_U8,
        AV_SAMPLE_FMT_S16,
        AV_SAMPLE_FMT_S32,
        AV_SAMPLE_FMT_FLT,
        AV_SAMPLE_FMT_DBL,
        AV_SAMPLE_FMT_NONE
    };

    layouts = ff_all_channel_layouts();
    if (!layouts) {
        return AVERROR(ENOMEM);
    }
    ff_set_common_channel_layouts(ctx, layouts);

    formats = ff_make_format_list(sample_fmts);
    if (!formats) {
        return AVERROR(ENOMEM);
    }
    ff_set_common_formats(ctx, formats);

    formats = ff_all_samplerates();
    if (!formats) {
        return AVERROR(ENOMEM);
    }
    ff_set_common_samplerates(ctx, formats);

    return 0;
}

static int config_props(AVFilterLink *inlink)
{
    AVFilterContext *ctx = inlink->dst;
    ATempoContext *atempo = ctx->priv;

    enum AVSampleFormat format = inlink->format;
    int sample_rate = (int)inlink->sample_rate;
    int channels = av_get_channel_layout_nb_channels(inlink->channel_layout);

    return yae_reset(atempo, format, sample_rate, channels);
}

static void push_samples(ATempoContext *atempo,
                         AVFilterLink *outlink,
                         int n_out)
{
    atempo->dst_buffer->audio->sample_rate = outlink->sample_rate;
    atempo->dst_buffer->audio->nb_samples  = n_out;

    // adjust the PTS:
    atempo->dst_buffer->pts =
        av_rescale_q(atempo->nsamples_out,
                     (AVRational){ 1, outlink->sample_rate },
                     outlink->time_base);

    ff_filter_frame(outlink, atempo->dst_buffer);
    atempo->dst_buffer = NULL;
    atempo->dst        = NULL;
    atempo->dst_end    = NULL;

    atempo->nsamples_out += n_out;
}
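/*
 * Added commentary: the PTS is derived from the running count of samples
 * already sent downstream, rescaled from the 1 / sample_rate time base to
 * the output link time base, so output timestamps stay continuous
 * regardless of how the input frames were sliced.
 */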

static int filter_frame(AVFilterLink *inlink,
                        AVFilterBufferRef *src_buffer)
{
    AVFilterContext *ctx = inlink->dst;
    ATempoContext *atempo = ctx->priv;
    AVFilterLink *outlink = ctx->outputs[0];

    int n_in  = src_buffer->audio->nb_samples;
    int n_out = (int)(0.5 + ((double)n_in) / atempo->tempo);

    const uint8_t *src = src_buffer->data[0];
    const uint8_t *src_end = src + n_in * atempo->stride;

    while (src < src_end) {
        if (!atempo->dst_buffer) {
            atempo->dst_buffer = ff_get_audio_buffer(outlink,
                                                     AV_PERM_WRITE,
                                                     n_out);
            avfilter_copy_buffer_ref_props(atempo->dst_buffer, src_buffer);

            atempo->dst = atempo->dst_buffer->data[0];
            atempo->dst_end = atempo->dst + n_out * atempo->stride;
        }

        yae_apply(atempo, &src, src_end, &atempo->dst, atempo->dst_end);

        if (atempo->dst == atempo->dst_end) {
            push_samples(atempo, outlink, n_out);
            atempo->request_fulfilled = 1;
        }
    }

    atempo->nsamples_in += n_in;
    avfilter_unref_bufferp(&src_buffer);
    return 0;
}

static int request_frame(AVFilterLink *outlink)
{
    AVFilterContext *ctx = outlink->src;
    ATempoContext *atempo = ctx->priv;
    int ret;

    atempo->request_fulfilled = 0;
    do {
        ret = ff_request_frame(ctx->inputs[0]);
    }
    while (!atempo->request_fulfilled && ret >= 0);

    if (ret == AVERROR_EOF) {
        // flush the filter:
        int n_max = atempo->ring;
        int n_out;
        int err = AVERROR(EAGAIN);

        while (err == AVERROR(EAGAIN)) {
            if (!atempo->dst_buffer) {
                atempo->dst_buffer = ff_get_audio_buffer(outlink,
                                                         AV_PERM_WRITE,
                                                         n_max);

                atempo->dst = atempo->dst_buffer->data[0];
                atempo->dst_end = atempo->dst + n_max * atempo->stride;
            }

            err = yae_flush(atempo, &atempo->dst, atempo->dst_end);

            n_out = ((atempo->dst - atempo->dst_buffer->data[0]) /
                     atempo->stride);

            if (n_out) {
                push_samples(atempo, outlink, n_out);
            }
        }

        avfilter_unref_bufferp(&atempo->dst_buffer);
        atempo->dst     = NULL;
        atempo->dst_end = NULL;

        return AVERROR_EOF;
    }

    return ret;
}

static int process_command(AVFilterContext *ctx,
                           const char *cmd,
                           const char *arg,
                           char *res,
                           int res_len,
                           int flags)
{
    return !strcmp(cmd, "tempo") ? yae_set_tempo(ctx, arg) : AVERROR(ENOSYS);
}
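/*
 * Usage note (added commentary): "tempo" is the only runtime command this
 * filter accepts; it can be issued, for example, through the sendcmd or
 * asendcmd filters to change the tempo mid-stream.
 */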

static const AVFilterPad atempo_inputs[] = {
    {
        .name         = "default",
        .type         = AVMEDIA_TYPE_AUDIO,
        .filter_frame = filter_frame,
        .config_props = config_props,
        .min_perms    = AV_PERM_READ,
    },
    { NULL }
};

static const AVFilterPad atempo_outputs[] = {
    {
        .name          = "default",
        .request_frame = request_frame,
        .type          = AVMEDIA_TYPE_AUDIO,
    },
    { NULL }
};

AVFilter avfilter_af_atempo = {
    .name            = "atempo",
    .description     = NULL_IF_CONFIG_SMALL("Adjust audio tempo."),
    .init            = init,
    .uninit          = uninit,
    .query_formats   = query_formats,
    .process_command = process_command,
    .priv_size       = sizeof(ATempoContext),
    .inputs          = atempo_inputs,
    .outputs         = atempo_outputs,
};