FFmpeg: libavfilter/af_atempo.c Source File

00001 /*
00002  * Copyright (c) 2012 Pavel Koshevoy <pkoshevoy at gmail dot com>
00003  *
00004  * This file is part of FFmpeg.
00005  *
00006  * FFmpeg is free software; you can redistribute it and/or
00007  * modify it under the terms of the GNU Lesser General Public
00008  * License as published by the Free Software Foundation; either
00009  * version 2.1 of the License, or (at your option) any later version.
00010  *
00011  * FFmpeg is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014  * Lesser General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with FFmpeg; if not, write to the Free Software
00018  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00019  */
00020 
00041 #include <float.h>
00042 #include "libavcodec/avfft.h"
00043 #include "libavutil/audioconvert.h"
00044 #include "libavutil/avassert.h"
00045 #include "libavutil/avstring.h"
00046 #include "libavutil/eval.h"
00047 #include "libavutil/opt.h"
00048 #include "libavutil/samplefmt.h"
00049 #include "avfilter.h"
00050 #include "audio.h"
00051 #include "internal.h"
00052 
// A fragment of the audio waveform: a run of up to `window` packed
// multi-channel samples plus the spectrum of its mono down-mix.
// Both buffers are allocated by yae_reset() (window * stride bytes for
// data, window * sizeof(FFTComplex) bytes for xdat) and freed by
// yae_release_buffers().
00056 typedef struct {
00057     // index of the first sample of this fragment in the overall waveform;
00058     // 0: input sample position
00059     // 1: output sample position
00060     int64_t position[2];
00061 
00062     // original packed multi-channel samples:
00063     uint8_t *data;
00064 
00065     // number of samples in this fragment:
00066     int nsamples;
00067 
00068     // rDFT transform of the down-mixed mono fragment, used for
00069     // fast waveform alignment via correlation in frequency domain:
00070     FFTSample *xdat;
00071 } AudioFragment;
00072 
// Filter state machine states, stored in ATempoContext.state.
// yae_apply() steps through LOAD_FRAGMENT -> ADJUST_POSITION ->
// (RELOAD_FRAGMENT ->) OUTPUT_OVERLAP_ADD and back to LOAD_FRAGMENT;
// yae_flush() switches the state to FLUSH_OUTPUT.
00076 typedef enum {
00077     YAE_LOAD_FRAGMENT,
00078     YAE_ADJUST_POSITION,
00079     YAE_RELOAD_FRAGMENT,
00080     YAE_OUTPUT_OVERLAP_ADD,
00081     YAE_FLUSH_OUTPUT,
00082 } FilterState;
00083 
// Filter state: input ring buffer, the two alternating fragments,
// rDFT contexts used for alignment, and output bookkeeping.
00087 typedef struct {
00088     // ring-buffer of input samples, necessary because sometimes
00089     // input fragment position may be adjusted backwards:
00090     uint8_t *buffer;
00091 
00092     // ring-buffer maximum capacity, expressed in sample rate time base
00093     // (yae_reset() sets it to 3 * window):
00094     int ring;
00095 
00096     // ring-buffer house keeping (all counted in samples, not bytes):
00096     int size;
00097     int head;
00098     int tail;
00099 
00100     // 0: input sample position corresponding to the ring buffer tail
00101     // 1: output sample position
00102     int64_t position[2];
00103 
00104     // sample format:
00105     enum AVSampleFormat format;
00106 
00107     // number of channels:
00108     int channels;
00109 
00110     // row of bytes to skip from one sample to the next, across multiple channels;
00111     // stride = (number-of-channels * bits-per-sample-per-channel) / 8
00112     int stride;
00113 
00114     // fragment window size, power-of-two integer:
00115     int window;
00116 
00117     // Hann window coefficients (window entries), for feathering
00118     // (blending) the overlapping fragment region:
00119     float *hann;
00120 
00121     // tempo scaling factor (yae_set_tempo() accepts 0.5 .. 2.0):
00122     double tempo;
00123 
00124     // cumulative alignment drift:
00125     int drift;
00126 
00127     // current/previous fragment ring-buffer
00127     // (the current fragment is frag[nfrag % 2]):
00128     AudioFragment frag[2];
00129 
00130     // current fragment index:
00131     uint64_t nfrag;
00132 
00133     // current state:
00134     FilterState state;
00135 
00136     // for fast correlation calculation in frequency domain:
00137     RDFTContext *real_to_complex;
00138     RDFTContext *complex_to_real;
00139     FFTSample *correlation;
00140 
00141     // for managing AVFilterPad.request_frame and AVFilterPad.filter_samples
00142     int request_fulfilled;
00143     AVFilterBufferRef *dst_buffer;
00144     uint8_t *dst;
00145     uint8_t *dst_end;
00146     uint64_t nsamples_in;
00147     uint64_t nsamples_out;
00148 } ATempoContext;
00149 
00153 static void yae_clear(ATempoContext *atempo)
00154 {
00155     atempo->size = 0;
00156     atempo->head = 0;
00157     atempo->tail = 0;
00158 
00159     atempo->drift = 0;
00160     atempo->nfrag = 0;
00161     atempo->state = YAE_LOAD_FRAGMENT;
00162 
00163     atempo->position[0] = 0;
00164     atempo->position[1] = 0;
00165 
00166     atempo->frag[0].position[0] = 0;
00167     atempo->frag[0].position[1] = 0;
00168     atempo->frag[0].nsamples    = 0;
00169 
00170     atempo->frag[1].position[0] = 0;
00171     atempo->frag[1].position[1] = 0;
00172     atempo->frag[1].nsamples    = 0;
00173 
00174     // shift left position of 1st fragment by half a window
00175     // so that no re-normalization would be required for
00176     // the left half of the 1st fragment:
00177     atempo->frag[0].position[0] = -(int64_t)(atempo->window / 2);
00178     atempo->frag[0].position[1] = -(int64_t)(atempo->window / 2);
00179 
00180     avfilter_unref_bufferp(&atempo->dst_buffer);
00181     atempo->dst     = NULL;
00182     atempo->dst_end = NULL;
00183 
00184     atempo->request_fulfilled = 0;
00185     atempo->nsamples_in       = 0;
00186     atempo->nsamples_out      = 0;
00187 }
00188 
00192 static void yae_release_buffers(ATempoContext *atempo)
00193 {
00194     yae_clear(atempo);
00195 
00196     av_freep(&atempo->frag[0].data);
00197     av_freep(&atempo->frag[1].data);
00198     av_freep(&atempo->frag[0].xdat);
00199     av_freep(&atempo->frag[1].xdat);
00200 
00201     av_freep(&atempo->buffer);
00202     av_freep(&atempo->hann);
00203     av_freep(&atempo->correlation);
00204 
00205     av_rdft_end(atempo->real_to_complex);
00206     atempo->real_to_complex = NULL;
00207 
00208     av_rdft_end(atempo->complex_to_real);
00209     atempo->complex_to_real = NULL;
00210 }
00211 
/* Free `field` and allocate `field_size` fresh bytes for it.
 *
 * av_realloc is not aligned enough; fortunately, the data does not need to
 * be preserved.
 *
 * On allocation failure every filter buffer is released and the ENCLOSING
 * function returns AVERROR(ENOMEM); the expansion site must therefore have
 * an `ATempoContext *atempo` in scope and return int.
 * `field` is evaluated more than once: pass a side-effect free lvalue. */
#define RE_MALLOC_OR_FAIL(field, field_size)                    \
    do {                                                        \
        av_freep(&(field));                                     \
        (field) = av_malloc(field_size);                        \
        if (!(field)) {                                         \
            yae_release_buffers(atempo);                        \
            return AVERROR(ENOMEM);                             \
        }                                                       \
    } while (0)
00223 
00228 static int yae_reset(ATempoContext *atempo,
00229                      enum AVSampleFormat format,
00230                      int sample_rate,
00231                      int channels)
00232 {
00233     const int sample_size = av_get_bytes_per_sample(format);
00234     uint32_t nlevels  = 0;
00235     uint32_t pot;
00236     int i;
00237 
00238     atempo->format   = format;
00239     atempo->channels = channels;
00240     atempo->stride   = sample_size * channels;
00241 
00242     // pick a segment window size:
00243     atempo->window = sample_rate / 24;
00244 
00245     // adjust window size to be a power-of-two integer:
00246     nlevels = av_log2(atempo->window);
00247     pot = 1 << nlevels;
00248     av_assert0(pot <= atempo->window);
00249 
00250     if (pot < atempo->window) {
00251         atempo->window = pot * 2;
00252         nlevels++;
00253     }
00254 
00255     // initialize audio fragment buffers:
00256     RE_MALLOC_OR_FAIL(atempo->frag[0].data, atempo->window * atempo->stride);
00257     RE_MALLOC_OR_FAIL(atempo->frag[1].data, atempo->window * atempo->stride);
00258     RE_MALLOC_OR_FAIL(atempo->frag[0].xdat, atempo->window * sizeof(FFTComplex));
00259     RE_MALLOC_OR_FAIL(atempo->frag[1].xdat, atempo->window * sizeof(FFTComplex));
00260 
00261     // initialize rDFT contexts:
00262     av_rdft_end(atempo->real_to_complex);
00263     atempo->real_to_complex = NULL;
00264 
00265     av_rdft_end(atempo->complex_to_real);
00266     atempo->complex_to_real = NULL;
00267 
00268     atempo->real_to_complex = av_rdft_init(nlevels + 1, DFT_R2C);
00269     if (!atempo->real_to_complex) {
00270         yae_release_buffers(atempo);
00271         return AVERROR(ENOMEM);
00272     }
00273 
00274     atempo->complex_to_real = av_rdft_init(nlevels + 1, IDFT_C2R);
00275     if (!atempo->complex_to_real) {
00276         yae_release_buffers(atempo);
00277         return AVERROR(ENOMEM);
00278     }
00279 
00280     RE_MALLOC_OR_FAIL(atempo->correlation, atempo->window * sizeof(FFTComplex));
00281 
00282     atempo->ring = atempo->window * 3;
00283     RE_MALLOC_OR_FAIL(atempo->buffer, atempo->ring * atempo->stride);
00284 
00285     // initialize the Hann window function:
00286     RE_MALLOC_OR_FAIL(atempo->hann, atempo->window * sizeof(float));
00287 
00288     for (i = 0; i < atempo->window; i++) {
00289         double t = (double)i / (double)(atempo->window - 1);
00290         double h = 0.5 * (1.0 - cos(2.0 * M_PI * t));
00291         atempo->hann[i] = (float)h;
00292     }
00293 
00294     yae_clear(atempo);
00295     return 0;
00296 }
00297 
00298 static int yae_set_tempo(AVFilterContext *ctx, const char *arg_tempo)
00299 {
00300     ATempoContext *atempo = ctx->priv;
00301     char   *tail = NULL;
00302     double tempo = av_strtod(arg_tempo, &tail);
00303 
00304     if (tail && *tail) {
00305         av_log(ctx, AV_LOG_ERROR, "Invalid tempo value '%s'\n", arg_tempo);
00306         return AVERROR(EINVAL);
00307     }
00308 
00309     if (tempo < 0.5 || tempo > 2.0) {
00310         av_log(ctx, AV_LOG_ERROR, "Tempo value %f exceeds [0.5, 2.0] range\n",
00311                tempo);
00312         return AVERROR(EINVAL);
00313     }
00314 
00315     atempo->tempo = tempo;
00316     return 0;
00317 }
00318 
00319 inline static AudioFragment *yae_curr_frag(ATempoContext *atempo)
00320 {
00321     return &atempo->frag[atempo->nfrag % 2];
00322 }
00323 
00324 inline static AudioFragment *yae_prev_frag(ATempoContext *atempo)
00325 {
00326     return &atempo->frag[(atempo->nfrag + 1) % 2];
00327 }
00328 
/*
 * Down-mix the packed samples of a fragment to mono, writing one
 * FFTSample per audio sample into frag->xdat.
 *
 * Expects `src` (const uint8_t *, advanced past the consumed samples),
 * `frag` and `atempo` in the enclosing scope. For multi-channel input
 * the channel sample with the largest magnitude (clamped to scalar_max
 * for the comparison only) is kept, sign included.
 */
#define yae_init_xdat(scalar_type, scalar_max)                          \
    do {                                                                \
        const uint8_t *const stop = src +                               \
            frag->nsamples * atempo->channels * sizeof(scalar_type);    \
                                                                        \
        FFTSample *mono = frag->xdat;                                   \
        scalar_type raw;                                                \
                                                                        \
        if (atempo->channels == 1) {                                    \
            /* mono input: plain type conversion */                     \
            while (src < stop) {                                        \
                raw  = *(const scalar_type *)src;                       \
                src += sizeof(scalar_type);                             \
                                                                        \
                *mono = (FFTSample)raw;                                 \
                mono++;                                                 \
            }                                                           \
        } else {                                                        \
            FFTSample peak, peak_mag, cur, cur_mag;                     \
            int ch;                                                     \
                                                                        \
            while (src < stop) {                                        \
                /* start with the first channel ... */                  \
                raw  = *(const scalar_type *)src;                       \
                src += sizeof(scalar_type);                             \
                                                                        \
                peak     = (FFTSample)raw;                              \
                peak_mag = FFMIN((FFTSample)scalar_max,                 \
                                 (FFTSample)fabsf(peak));               \
                                                                        \
                /* ... and let any louder channel replace it */         \
                for (ch = 1; ch < atempo->channels; ch++) {             \
                    raw  = *(const scalar_type *)src;                   \
                    src += sizeof(scalar_type);                         \
                                                                        \
                    cur     = (FFTSample)raw;                           \
                    cur_mag = FFMIN((FFTSample)scalar_max,              \
                                    (FFTSample)fabsf(cur));             \
                                                                        \
                    if (peak_mag < cur_mag) {                           \
                        peak_mag = cur_mag;                             \
                        peak     = cur;                                 \
                    }                                                   \
                }                                                       \
                                                                        \
                *mono = peak;                                           \
                mono++;                                                 \
            }                                                           \
        }                                                               \
    } while (0)
00378 
00383 static void yae_downmix(ATempoContext *atempo, AudioFragment *frag)
00384 {
00385     // shortcuts:
00386     const uint8_t *src = frag->data;
00387 
00388     // init complex data buffer used for FFT and Correlation:
00389     memset(frag->xdat, 0, sizeof(FFTComplex) * atempo->window);
00390 
00391     if (atempo->format == AV_SAMPLE_FMT_U8) {
00392         yae_init_xdat(uint8_t, 127);
00393     } else if (atempo->format == AV_SAMPLE_FMT_S16) {
00394         yae_init_xdat(int16_t, 32767);
00395     } else if (atempo->format == AV_SAMPLE_FMT_S32) {
00396         yae_init_xdat(int, 2147483647);
00397     } else if (atempo->format == AV_SAMPLE_FMT_FLT) {
00398         yae_init_xdat(float, 1);
00399     } else if (atempo->format == AV_SAMPLE_FMT_DBL) {
00400         yae_init_xdat(double, 1);
00401     }
00402 }
00403 
00411 static int yae_load_data(ATempoContext *atempo,
00412                          const uint8_t **src_ref,
00413                          const uint8_t *src_end,
00414                          int64_t stop_here)
00415 {
00416     // shortcut:
00417     const uint8_t *src = *src_ref;
00418     const int read_size = stop_here - atempo->position[0];
00419 
00420     if (stop_here <= atempo->position[0]) {
00421         return 0;
00422     }
00423 
00424     // samples are not expected to be skipped:
00425     av_assert0(read_size <= atempo->ring);
00426 
00427     while (atempo->position[0] < stop_here && src < src_end) {
00428         int src_samples = (src_end - src) / atempo->stride;
00429 
00430         // load data piece-wise, in order to avoid complicating the logic:
00431         int nsamples = FFMIN(read_size, src_samples);
00432         int na;
00433         int nb;
00434 
00435         nsamples = FFMIN(nsamples, atempo->ring);
00436         na = FFMIN(nsamples, atempo->ring - atempo->tail);
00437         nb = FFMIN(nsamples - na, atempo->ring);
00438 
00439         if (na) {
00440             uint8_t *a = atempo->buffer + atempo->tail * atempo->stride;
00441             memcpy(a, src, na * atempo->stride);
00442 
00443             src += na * atempo->stride;
00444             atempo->position[0] += na;
00445 
00446             atempo->size = FFMIN(atempo->size + na, atempo->ring);
00447             atempo->tail = (atempo->tail + na) % atempo->ring;
00448             atempo->head =
00449                 atempo->size < atempo->ring ?
00450                 atempo->tail - atempo->size :
00451                 atempo->tail;
00452         }
00453 
00454         if (nb) {
00455             uint8_t *b = atempo->buffer;
00456             memcpy(b, src, nb * atempo->stride);
00457 
00458             src += nb * atempo->stride;
00459             atempo->position[0] += nb;
00460 
00461             atempo->size = FFMIN(atempo->size + nb, atempo->ring);
00462             atempo->tail = (atempo->tail + nb) % atempo->ring;
00463             atempo->head =
00464                 atempo->size < atempo->ring ?
00465                 atempo->tail - atempo->size :
00466                 atempo->tail;
00467         }
00468     }
00469 
00470     // pass back the updated source buffer pointer:
00471     *src_ref = src;
00472 
00473     // sanity check:
00474     av_assert0(atempo->position[0] <= stop_here);
00475 
00476     return atempo->position[0] == stop_here ? 0 : AVERROR(EAGAIN);
00477 }
00478 
00486 static int yae_load_frag(ATempoContext *atempo,
00487                          const uint8_t **src_ref,
00488                          const uint8_t *src_end)
00489 {
00490     // shortcuts:
00491     AudioFragment *frag = yae_curr_frag(atempo);
00492     uint8_t *dst;
00493     int64_t missing, start, zeros;
00494     uint32_t nsamples;
00495     const uint8_t *a, *b;
00496     int i0, i1, n0, n1, na, nb;
00497 
00498     int64_t stop_here = frag->position[0] + atempo->window;
00499     if (src_ref && yae_load_data(atempo, src_ref, src_end, stop_here) != 0) {
00500         return AVERROR(EAGAIN);
00501     }
00502 
00503     // calculate the number of samples we don't have:
00504     missing =
00505         stop_here > atempo->position[0] ?
00506         stop_here - atempo->position[0] : 0;
00507 
00508     nsamples =
00509         missing < (int64_t)atempo->window ?
00510         (uint32_t)(atempo->window - missing) : 0;
00511 
00512     // setup the output buffer:
00513     frag->nsamples = nsamples;
00514     dst = frag->data;
00515 
00516     start = atempo->position[0] - atempo->size;
00517     zeros = 0;
00518 
00519     if (frag->position[0] < start) {
00520         // what we don't have we substitute with zeros:
00521         zeros = FFMIN(start - frag->position[0], (int64_t)nsamples);
00522         av_assert0(zeros != nsamples);
00523 
00524         memset(dst, 0, zeros * atempo->stride);
00525         dst += zeros * atempo->stride;
00526     }
00527 
00528     if (zeros == nsamples) {
00529         return 0;
00530     }
00531 
00532     // get the remaining data from the ring buffer:
00533     na = (atempo->head < atempo->tail ?
00534           atempo->tail - atempo->head :
00535           atempo->ring - atempo->head);
00536 
00537     nb = atempo->head < atempo->tail ? 0 : atempo->tail;
00538 
00539     // sanity check:
00540     av_assert0(nsamples <= zeros + na + nb);
00541 
00542     a = atempo->buffer + atempo->head * atempo->stride;
00543     b = atempo->buffer;
00544 
00545     i0 = frag->position[0] + zeros - start;
00546     i1 = i0 < na ? 0 : i0 - na;
00547 
00548     n0 = i0 < na ? FFMIN(na - i0, (int)(nsamples - zeros)) : 0;
00549     n1 = nsamples - zeros - n0;
00550 
00551     if (n0) {
00552         memcpy(dst, a + i0 * atempo->stride, n0 * atempo->stride);
00553         dst += n0 * atempo->stride;
00554     }
00555 
00556     if (n1) {
00557         memcpy(dst, b + i1 * atempo->stride, n1 * atempo->stride);
00558     }
00559 
00560     return 0;
00561 }
00562 
00566 static void yae_advance_to_next_frag(ATempoContext *atempo)
00567 {
00568     const double fragment_step = atempo->tempo * (double)(atempo->window / 2);
00569 
00570     const AudioFragment *prev;
00571     AudioFragment       *frag;
00572 
00573     atempo->nfrag++;
00574     prev = yae_prev_frag(atempo);
00575     frag = yae_curr_frag(atempo);
00576 
00577     frag->position[0] = prev->position[0] + (int64_t)fragment_step;
00578     frag->position[1] = prev->position[1] + atempo->window / 2;
00579     frag->nsamples    = 0;
00580 }
00581 
00588 static void yae_xcorr_via_rdft(FFTSample *xcorr,
00589                                RDFTContext *complex_to_real,
00590                                const FFTComplex *xa,
00591                                const FFTComplex *xb,
00592                                const int window)
00593 {
00594     FFTComplex *xc = (FFTComplex *)xcorr;
00595     int i;
00596 
00597     // NOTE: first element requires special care -- Given Y = rDFT(X),
00598     // Im(Y[0]) and Im(Y[N/2]) are always zero, therefore av_rdft_calc
00599     // stores Re(Y[N/2]) in place of Im(Y[0]).
00600 
00601     xc->re = xa->re * xb->re;
00602     xc->im = xa->im * xb->im;
00603     xa++;
00604     xb++;
00605     xc++;
00606 
00607     for (i = 1; i < window; i++, xa++, xb++, xc++) {
00608         xc->re = (xa->re * xb->re + xa->im * xb->im);
00609         xc->im = (xa->im * xb->re - xa->re * xb->im);
00610     }
00611 
00612     // apply inverse rDFT:
00613     av_rdft_calc(complex_to_real, xcorr);
00614 }
00615 
00622 static int yae_align(AudioFragment *frag,
00623                      const AudioFragment *prev,
00624                      const int window,
00625                      const int delta_max,
00626                      const int drift,
00627                      FFTSample *correlation,
00628                      RDFTContext *complex_to_real)
00629 {
00630     int       best_offset = -drift;
00631     FFTSample best_metric = -FLT_MAX;
00632     FFTSample *xcorr;
00633 
00634     int i0;
00635     int i1;
00636     int i;
00637 
00638     yae_xcorr_via_rdft(correlation,
00639                        complex_to_real,
00640                        (const FFTComplex *)prev->xdat,
00641                        (const FFTComplex *)frag->xdat,
00642                        window);
00643 
00644     // identify search window boundaries:
00645     i0 = FFMAX(window / 2 - delta_max - drift, 0);
00646     i0 = FFMIN(i0, window);
00647 
00648     i1 = FFMIN(window / 2 + delta_max - drift, window - window / 16);
00649     i1 = FFMAX(i1, 0);
00650 
00651     // identify cross-correlation peaks within search window:
00652     xcorr = correlation + i0;
00653 
00654     for (i = i0; i < i1; i++, xcorr++) {
00655         FFTSample metric = *xcorr;
00656 
00657         // normalize:
00658         FFTSample drifti = (FFTSample)(drift + i);
00659         metric *= drifti * (FFTSample)(i - i0) * (FFTSample)(i1 - i);
00660 
00661         if (metric > best_metric) {
00662             best_metric = metric;
00663             best_offset = i - window / 2;
00664         }
00665     }
00666 
00667     return best_offset;
00668 }
00669 
00676 static int yae_adjust_position(ATempoContext *atempo)
00677 {
00678     const AudioFragment *prev = yae_prev_frag(atempo);
00679     AudioFragment       *frag = yae_curr_frag(atempo);
00680 
00681     const int delta_max  = atempo->window / 2;
00682     const int correction = yae_align(frag,
00683                                      prev,
00684                                      atempo->window,
00685                                      delta_max,
00686                                      atempo->drift,
00687                                      atempo->correlation,
00688                                      atempo->complex_to_real);
00689 
00690     if (correction) {
00691         // adjust fragment position:
00692         frag->position[0] -= correction;
00693 
00694         // clear so that the fragment can be reloaded:
00695         frag->nsamples = 0;
00696 
00697         // update cumulative correction drift counter:
00698         atempo->drift += correction;
00699     }
00700 
00701     return correction;
00702 }
00703 
/*
 * Blend the overlap region of the previous and current fragment and
 * write the result to the output buffer.
 *
 * Expects in the enclosing scope: `a`/`b` (samples of the previous and
 * current fragment), `wa`/`wb` (their window coefficients, advanced
 * here), `overlap`, `dst` (updated to the new write position),
 * `dst_end`, `frag` and `atempo`. The output position
 * atempo->position[1] advances by one per blended sample. Samples at a
 * negative input position are copied from `a` unblended.
 */
#define yae_blend(scalar_type)                                          \
    do {                                                                \
        const scalar_type *lhs = (const scalar_type *)a;                \
        const scalar_type *rhs = (const scalar_type *)b;                \
                                                                        \
        scalar_type *wr       = (scalar_type *)dst;                     \
        scalar_type *wr_limit = (scalar_type *)dst_end;                 \
        int64_t n = 0;                                                  \
                                                                        \
        while (n < overlap && wr < wr_limit) {                          \
            const float gain_a = *wa;                                   \
            const float gain_b = *wb;                                   \
            int ch;                                                     \
                                                                        \
            for (ch = 0; ch < atempo->channels;                         \
                 ch++, lhs++, rhs++, wr++) {                            \
                const float va = (float)*lhs;                           \
                const float vb = (float)*rhs;                           \
                                                                        \
                if (frag->position[0] + n < 0) {                        \
                    *wr = *lhs;                                         \
                } else {                                                \
                    *wr = (scalar_type)(va * gain_a + vb * gain_b);     \
                }                                                       \
            }                                                           \
                                                                        \
            n++;                                                        \
            atempo->position[1]++;                                      \
            wa++;                                                       \
            wb++;                                                       \
        }                                                               \
        dst = (uint8_t *)wr;                                            \
    } while (0)
00736 
00745 static int yae_overlap_add(ATempoContext *atempo,
00746                            uint8_t **dst_ref,
00747                            uint8_t *dst_end)
00748 {
00749     // shortcuts:
00750     const AudioFragment *prev = yae_prev_frag(atempo);
00751     const AudioFragment *frag = yae_curr_frag(atempo);
00752 
00753     const int64_t start_here = FFMAX(atempo->position[1],
00754                                      frag->position[1]);
00755 
00756     const int64_t stop_here = FFMIN(prev->position[1] + prev->nsamples,
00757                                     frag->position[1] + frag->nsamples);
00758 
00759     const int64_t overlap = stop_here - start_here;
00760 
00761     const int64_t ia = start_here - prev->position[1];
00762     const int64_t ib = start_here - frag->position[1];
00763 
00764     const float *wa = atempo->hann + ia;
00765     const float *wb = atempo->hann + ib;
00766 
00767     const uint8_t *a = prev->data + ia * atempo->stride;
00768     const uint8_t *b = frag->data + ib * atempo->stride;
00769 
00770     uint8_t *dst = *dst_ref;
00771 
00772     av_assert0(start_here <= stop_here &&
00773                frag->position[1] <= start_here &&
00774                overlap <= frag->nsamples);
00775 
00776     if (atempo->format == AV_SAMPLE_FMT_U8) {
00777         yae_blend(uint8_t);
00778     } else if (atempo->format == AV_SAMPLE_FMT_S16) {
00779         yae_blend(int16_t);
00780     } else if (atempo->format == AV_SAMPLE_FMT_S32) {
00781         yae_blend(int);
00782     } else if (atempo->format == AV_SAMPLE_FMT_FLT) {
00783         yae_blend(float);
00784     } else if (atempo->format == AV_SAMPLE_FMT_DBL) {
00785         yae_blend(double);
00786     }
00787 
00788     // pass-back the updated destination buffer pointer:
00789     *dst_ref = dst;
00790 
00791     return atempo->position[1] == stop_here ? 0 : AVERROR(EAGAIN);
00792 }
00793 
00799 static void
00800 yae_apply(ATempoContext *atempo,
00801           const uint8_t **src_ref,
00802           const uint8_t *src_end,
00803           uint8_t **dst_ref,
00804           uint8_t *dst_end)
00805 {
00806     while (1) {
00807         if (atempo->state == YAE_LOAD_FRAGMENT) {
00808             // load additional data for the current fragment:
00809             if (yae_load_frag(atempo, src_ref, src_end) != 0) {
00810                 break;
00811             }
00812 
00813             // down-mix to mono:
00814             yae_downmix(atempo, yae_curr_frag(atempo));
00815 
00816             // apply rDFT:
00817             av_rdft_calc(atempo->real_to_complex, yae_curr_frag(atempo)->xdat);
00818 
00819             // must load the second fragment before alignment can start:
00820             if (!atempo->nfrag) {
00821                 yae_advance_to_next_frag(atempo);
00822                 continue;
00823             }
00824 
00825             atempo->state = YAE_ADJUST_POSITION;
00826         }
00827 
00828         if (atempo->state == YAE_ADJUST_POSITION) {
00829             // adjust position for better alignment:
00830             if (yae_adjust_position(atempo)) {
00831                 // reload the fragment at the corrected position, so that the
00832                 // Hann window blending would not require normalization:
00833                 atempo->state = YAE_RELOAD_FRAGMENT;
00834             } else {
00835                 atempo->state = YAE_OUTPUT_OVERLAP_ADD;
00836             }
00837         }
00838 
00839         if (atempo->state == YAE_RELOAD_FRAGMENT) {
00840             // load additional data if necessary due to position adjustment:
00841             if (yae_load_frag(atempo, src_ref, src_end) != 0) {
00842                 break;
00843             }
00844 
00845             // down-mix to mono:
00846             yae_downmix(atempo, yae_curr_frag(atempo));
00847 
00848             // apply rDFT:
00849             av_rdft_calc(atempo->real_to_complex, yae_curr_frag(atempo)->xdat);
00850 
00851             atempo->state = YAE_OUTPUT_OVERLAP_ADD;
00852         }
00853 
00854         if (atempo->state == YAE_OUTPUT_OVERLAP_ADD) {
00855             // overlap-add and output the result:
00856             if (yae_overlap_add(atempo, dst_ref, dst_end) != 0) {
00857                 break;
00858             }
00859 
00860             // advance to the next fragment, repeat:
00861             yae_advance_to_next_frag(atempo);
00862             atempo->state = YAE_LOAD_FRAGMENT;
00863         }
00864     }
00865 }
00866 
00874 static int yae_flush(ATempoContext *atempo,
00875                      uint8_t **dst_ref,
00876                      uint8_t *dst_end)
00877 {
00878     AudioFragment *frag = yae_curr_frag(atempo);
00879     int64_t overlap_end;
00880     int64_t start_here;
00881     int64_t stop_here;
00882     int64_t offset;
00883 
00884     const uint8_t *src;
00885     uint8_t *dst;
00886 
00887     int src_size;
00888     int dst_size;
00889     int nbytes;
00890 
00891     atempo->state = YAE_FLUSH_OUTPUT;
00892 
00893     if (atempo->position[0] == frag->position[0] + frag->nsamples &&
00894         atempo->position[1] == frag->position[1] + frag->nsamples) {
00895         // the current fragment is already flushed:
00896         return 0;
00897     }
00898 
00899     if (frag->position[0] + frag->nsamples < atempo->position[0]) {
00900         // finish loading the current (possibly partial) fragment:
00901         yae_load_frag(atempo, NULL, NULL);
00902 
00903         if (atempo->nfrag) {
00904             // down-mix to mono:
00905             yae_downmix(atempo, frag);
00906 
00907             // apply rDFT:
00908             av_rdft_calc(atempo->real_to_complex, frag->xdat);
00909 
00910             // align current fragment to previous fragment:
00911             if (yae_adjust_position(atempo)) {
00912                 // reload the current fragment due to adjusted position:
00913                 yae_load_frag(atempo, NULL, NULL);
00914             }
00915         }
00916     }
00917 
00918     // flush the overlap region:
00919     overlap_end = frag->position[1] + FFMIN(atempo->window / 2,
00920                                             frag->nsamples);
00921 
00922     while (atempo->position[1] < overlap_end) {
00923         if (yae_overlap_add(atempo, dst_ref, dst_end) != 0) {
00924             return AVERROR(EAGAIN);
00925         }
00926     }
00927 
00928     // flush the remaininder of the current fragment:
00929     start_here = FFMAX(atempo->position[1], overlap_end);
00930     stop_here  = frag->position[1] + frag->nsamples;
00931     offset     = start_here - frag->position[1];
00932     av_assert0(start_here <= stop_here && frag->position[1] <= start_here);
00933 
00934     src = frag->data + offset * atempo->stride;
00935     dst = (uint8_t *)*dst_ref;
00936 
00937     src_size = (int)(stop_here - start_here) * atempo->stride;
00938     dst_size = dst_end - dst;
00939     nbytes = FFMIN(src_size, dst_size);
00940 
00941     memcpy(dst, src, nbytes);
00942     dst += nbytes;
00943 
00944     atempo->position[1] += (nbytes / atempo->stride);
00945 
00946     // pass-back the updated destination buffer pointer:
00947     *dst_ref = (uint8_t *)dst;
00948 
00949     return atempo->position[1] == stop_here ? 0 : AVERROR(EAGAIN);
00950 }
00951 
00952 static av_cold int init(AVFilterContext *ctx, const char *args)
00953 {
00954     ATempoContext *atempo = ctx->priv;
00955 
00956     // NOTE: this assumes that the caller has memset ctx->priv to 0:
00957     atempo->format = AV_SAMPLE_FMT_NONE;
00958     atempo->tempo  = 1.0;
00959     atempo->state  = YAE_LOAD_FRAGMENT;
00960 
00961     return args ? yae_set_tempo(ctx, args) : 0;
00962 }
00963 
00964 static av_cold void uninit(AVFilterContext *ctx)
00965 {
00966     ATempoContext *atempo = ctx->priv;
00967     yae_release_buffers(atempo);
00968 }
00969 
00970 static int query_formats(AVFilterContext *ctx)
00971 {
00972     AVFilterChannelLayouts *layouts = NULL;
00973     AVFilterFormats        *formats = NULL;
00974 
00975     // WSOLA necessitates an internal sliding window ring buffer
00976     // for incoming audio stream.
00977     //
00978     // Planar sample formats are too cumbersome to store in a ring buffer,
00979     // therefore planar sample formats are not supported.
00980     //
00981     enum AVSampleFormat sample_fmts[] = {
00982         AV_SAMPLE_FMT_U8,
00983         AV_SAMPLE_FMT_S16,
00984         AV_SAMPLE_FMT_S32,
00985         AV_SAMPLE_FMT_FLT,
00986         AV_SAMPLE_FMT_DBL,
00987         AV_SAMPLE_FMT_NONE
00988     };
00989 
00990     layouts = ff_all_channel_layouts();
00991     if (!layouts) {
00992         return AVERROR(ENOMEM);
00993     }
00994     ff_set_common_channel_layouts(ctx, layouts);
00995 
00996     formats = ff_make_format_list(sample_fmts);
00997     if (!formats) {
00998         return AVERROR(ENOMEM);
00999     }
01000     ff_set_common_formats(ctx, formats);
01001 
01002     formats = ff_all_samplerates();
01003     if (!formats) {
01004         return AVERROR(ENOMEM);
01005     }
01006     ff_set_common_samplerates(ctx, formats);
01007 
01008     return 0;
01009 }
01010 
01011 static int config_props(AVFilterLink *inlink)
01012 {
01013     AVFilterContext  *ctx = inlink->dst;
01014     ATempoContext *atempo = ctx->priv;
01015 
01016     enum AVSampleFormat format = inlink->format;
01017     int sample_rate = (int)inlink->sample_rate;
01018     int channels = av_get_channel_layout_nb_channels(inlink->channel_layout);
01019 
01020     return yae_reset(atempo, format, sample_rate, channels);
01021 }
01022 
01023 static void push_samples(ATempoContext *atempo,
01024                          AVFilterLink *outlink,
01025                          int n_out)
01026 {
01027     atempo->dst_buffer->audio->sample_rate = outlink->sample_rate;
01028     atempo->dst_buffer->audio->nb_samples  = n_out;
01029 
01030     // adjust the PTS:
01031     atempo->dst_buffer->pts =
01032         av_rescale_q(atempo->nsamples_out,
01033                      (AVRational){ 1, outlink->sample_rate },
01034                      outlink->time_base);
01035 
01036     ff_filter_samples(outlink, atempo->dst_buffer);
01037     atempo->dst_buffer = NULL;
01038     atempo->dst        = NULL;
01039     atempo->dst_end    = NULL;
01040 
01041     atempo->nsamples_out += n_out;
01042 }
01043 
01044 static int filter_samples(AVFilterLink *inlink,
01045                            AVFilterBufferRef *src_buffer)
01046 {
01047     AVFilterContext  *ctx = inlink->dst;
01048     ATempoContext *atempo = ctx->priv;
01049     AVFilterLink *outlink = ctx->outputs[0];
01050 
01051     int n_in = src_buffer->audio->nb_samples;
01052     int n_out = (int)(0.5 + ((double)n_in) / atempo->tempo);
01053 
01054     const uint8_t *src = src_buffer->data[0];
01055     const uint8_t *src_end = src + n_in * atempo->stride;
01056 
01057     while (src < src_end) {
01058         if (!atempo->dst_buffer) {
01059             atempo->dst_buffer = ff_get_audio_buffer(outlink,
01060                                                      AV_PERM_WRITE,
01061                                                      n_out);
01062             avfilter_copy_buffer_ref_props(atempo->dst_buffer, src_buffer);
01063 
01064             atempo->dst = atempo->dst_buffer->data[0];
01065             atempo->dst_end = atempo->dst + n_out * atempo->stride;
01066         }
01067 
01068         yae_apply(atempo, &src, src_end, &atempo->dst, atempo->dst_end);
01069 
01070         if (atempo->dst == atempo->dst_end) {
01071             push_samples(atempo, outlink, n_out);
01072             atempo->request_fulfilled = 1;
01073         }
01074     }
01075 
01076     atempo->nsamples_in += n_in;
01077     avfilter_unref_bufferp(&src_buffer);
01078     return 0;
01079 }
01080 
01081 static int request_frame(AVFilterLink *outlink)
01082 {
01083     AVFilterContext  *ctx = outlink->src;
01084     ATempoContext *atempo = ctx->priv;
01085     int ret;
01086 
01087     atempo->request_fulfilled = 0;
01088     do {
01089         ret = ff_request_frame(ctx->inputs[0]);
01090     }
01091     while (!atempo->request_fulfilled && ret >= 0);
01092 
01093     if (ret == AVERROR_EOF) {
01094         // flush the filter:
01095         int n_max = atempo->ring;
01096         int n_out;
01097         int err = AVERROR(EAGAIN);
01098 
01099         while (err == AVERROR(EAGAIN)) {
01100             if (!atempo->dst_buffer) {
01101                 atempo->dst_buffer = ff_get_audio_buffer(outlink,
01102                                                          AV_PERM_WRITE,
01103                                                          n_max);
01104 
01105                 atempo->dst = atempo->dst_buffer->data[0];
01106                 atempo->dst_end = atempo->dst + n_max * atempo->stride;
01107             }
01108 
01109             err = yae_flush(atempo, &atempo->dst, atempo->dst_end);
01110 
01111             n_out = ((atempo->dst - atempo->dst_buffer->data[0]) /
01112                      atempo->stride);
01113 
01114             if (n_out) {
01115                 push_samples(atempo, outlink, n_out);
01116             }
01117         }
01118 
01119         avfilter_unref_bufferp(&atempo->dst_buffer);
01120         atempo->dst     = NULL;
01121         atempo->dst_end = NULL;
01122 
01123         return AVERROR_EOF;
01124     }
01125 
01126     return ret;
01127 }
01128 
01129 static int process_command(AVFilterContext *ctx,
01130                            const char *cmd,
01131                            const char *arg,
01132                            char *res,
01133                            int res_len,
01134                            int flags)
01135 {
01136     return !strcmp(cmd, "tempo") ? yae_set_tempo(ctx, arg) : AVERROR(ENOSYS);
01137 }
01138 
01139 AVFilter avfilter_af_atempo = {
01140     .name            = "atempo",
01141     .description     = NULL_IF_CONFIG_SMALL("Adjust audio tempo."),
01142     .init            = init,
01143     .uninit          = uninit,
01144     .query_formats   = query_formats,
01145     .process_command = process_command,
01146     .priv_size       = sizeof(ATempoContext),
01147 
01148     .inputs    = (const AVFilterPad[]) {
01149         { .name            = "default",
01150           .type            = AVMEDIA_TYPE_AUDIO,
01151           .filter_samples  = filter_samples,
01152           .config_props    = config_props,
01153           .min_perms       = AV_PERM_READ, },
01154         { .name = NULL}
01155     },
01156 
01157     .outputs   = (const AVFilterPad[]) {
01158         { .name            = "default",
01159           .request_frame   = request_frame,
01160           .type            = AVMEDIA_TYPE_AUDIO, },
01161         { .name = NULL}
01162     },
01163 };