/*
 * Copyright (c) 2025 Vittorio Palmisano
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with FFmpeg; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#include <whisper.h>

#include "libavutil/avutil.h"
#include "libavutil/avstring.h"
#include "libavutil/channel_layout.h"
#include "libavutil/internal.h"
#include "libavutil/mem.h"
#include "libavutil/opt.h"
#include "libavutil/samplefmt.h"
#include "libavutil/thread.h"
#include "libavformat/avio.h"
#include "libavfilter/avfilter.h"
#include "libavfilter/audio.h"

#include "filters.h"
#include "formats.h"

41 typedef struct WhisperContext {
42  const AVClass *class;
43  char *model_path;
44  char *language;
45  bool use_gpu;
51 
53  char *destination;
54  char *format;
55  int max_len;
56 
57  struct whisper_context *ctx_wsp;
58  struct whisper_vad_context *ctx_vad;
59  struct whisper_vad_params vad_params;
60 
61  float *audio_buffer;
66 
67  int eof;
69 
71  int index;
73 
74 static void cb_log(enum ggml_log_level level, const char *text, void *user_data)
75 {
78  switch (level) {
79  case GGML_LOG_LEVEL_ERROR:
81  break;
82  case GGML_LOG_LEVEL_WARN:
84  break;
85  }
86  av_log(ctx, av_log_level, "%s", text);
87 }
88 
89 static int init(AVFilterContext *ctx)
90 {
91  WhisperContext *wctx = ctx->priv;
92 
93  static AVOnce init_static_once = AV_ONCE_INIT;
94  ff_thread_once(&init_static_once, ggml_backend_load_all);
95 
96  whisper_log_set(cb_log, ctx);
97 
98  // Init whisper context
99  if (!wctx->model_path) {
100  av_log(ctx, AV_LOG_ERROR, "No whisper model path specified. Use the 'model' option.\n");
101  return AVERROR(EINVAL);
102  }
103 
104  struct whisper_context_params params = whisper_context_default_params();
105  params.use_gpu = wctx->use_gpu;
106  params.gpu_device = wctx->gpu_device;
107 
108  wctx->ctx_wsp = whisper_init_from_file_with_params(wctx->model_path, params);
109  if (wctx->ctx_wsp == NULL) {
110  av_log(ctx, AV_LOG_ERROR, "Failed to initialize whisper context from model: %s\n", wctx->model_path);
111  return AVERROR(EIO);
112  }
113 
114  // Init buffer
115  wctx->audio_buffer_queue_size = av_rescale(wctx->queue, WHISPER_SAMPLE_RATE, AV_TIME_BASE);
116  wctx->audio_buffer = av_malloc_array(wctx->audio_buffer_queue_size, sizeof(*wctx->audio_buffer));
117  if (!wctx->audio_buffer)
118  return AVERROR(ENOMEM);
119 
120  // Init VAD model context
121  if (wctx->vad_model_path) {
122  struct whisper_vad_context_params ctx_params = whisper_vad_default_context_params();
123  ctx_params.n_threads = ff_filter_get_nb_threads(ctx);
124  // ctx_params.use_gpu = wctx->use_gpu; TODO (see: whisper_vad_init_context)
125  ctx_params.gpu_device = wctx->gpu_device;
126  wctx->ctx_vad = whisper_vad_init_from_file_with_params(wctx->vad_model_path, ctx_params);
127 
128  wctx->vad_params = whisper_vad_default_params();
129  wctx->vad_params.threshold = wctx->vad_threshold;
130  wctx->vad_params.min_speech_duration_ms = av_rescale(wctx->vad_min_speech_duration, 1000, AV_TIME_BASE);
131  wctx->vad_params.min_silence_duration_ms = av_rescale(wctx->vad_min_silence_duration, 1000, AV_TIME_BASE);
132  wctx->vad_params.max_speech_duration_s = av_rescale(wctx->queue, 1, AV_TIME_BASE);
133  wctx->vad_params.speech_pad_ms = 0;
134  wctx->vad_params.samples_overlap = 0;
135  }
136 
137  wctx->next_pts = AV_NOPTS_VALUE;
138 
139  if (wctx->destination && strcmp("", wctx->destination)) {
140  const char *dst = wctx->destination;
141  if (!strcmp("-", dst))
142  dst = "pipe:1";
143  int ret = avio_open(&wctx->avio_context, dst, AVIO_FLAG_WRITE);
144 
145  if (ret < 0) {
146  av_log(ctx, AV_LOG_ERROR, "Could not open %s: %s\n", wctx->destination, av_err2str(ret));
147  return ret;
148  }
149 
151  }
152 
154  "Whisper filter initialized: model: %s lang: %s queue: %" PRId64 " ms\n",
155  wctx->model_path, wctx->language, wctx->queue / 1000);
156 
157  return 0;
158 }
159 
161 {
162  WhisperContext *wctx = ctx->priv;
163 
164  if (wctx->audio_buffer_fill_size > 0) {
166  "Remaining audio buffer %d samples (%d seconds) after stopping\n",
167  wctx->audio_buffer_fill_size, wctx->audio_buffer_fill_size / WHISPER_SAMPLE_RATE);
168  }
169 
170  if (wctx->ctx_vad) {
171  whisper_vad_free(wctx->ctx_vad);
172  wctx->ctx_vad = NULL;
173  }
174 
175  if (wctx->ctx_wsp) {
176  whisper_free(wctx->ctx_wsp);
177  wctx->ctx_wsp = NULL;
178  }
179 
180  av_freep(&wctx->audio_buffer);
181 
182  if (wctx->avio_context)
183  avio_closep(&wctx->avio_context);
184 }
185 
187 {
188  WhisperContext *wctx = ctx->priv;
190 
191  if (!wctx->ctx_wsp || samples == 0)
192  return;
193 
194  const int64_t timestamp_ms = wctx->audio_buffer_start_ms;
195  const float duration = (float) samples / WHISPER_SAMPLE_RATE;
196 
198  "run transcription at %" PRId64 " ms, %d/%d samples (%.2f seconds)...\n",
199  timestamp_ms, samples, wctx->audio_buffer_fill_size, duration);
200 
201  struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
202  params.language = wctx->language;
203  params.n_threads = ff_filter_get_nb_threads(ctx);
204  params.print_special = 0;
205  params.print_progress = 0;
206  params.print_realtime = 0;
207  params.print_timestamps = 0;
208  params.max_len = wctx->max_len;
209  params.token_timestamps = (wctx->max_len > 0);
210 
211  if (whisper_full(wctx->ctx_wsp, params, wctx->audio_buffer, samples) != 0) {
212  av_log(ctx, AV_LOG_ERROR, "Failed to process audio with whisper.cpp\n");
213  return;
214  }
215 
216  const int n_segments = whisper_full_n_segments(wctx->ctx_wsp);
217  char *segments_text = NULL;
218 
219  for (int i = 0; i < n_segments; ++i) {
220  const char *text = whisper_full_get_segment_text(wctx->ctx_wsp, i);
221  if (av_isspace(text[0]))
222  text++;
223  char *text_cleaned = av_strireplace(text, "[BLANK_AUDIO]", "");
224 
225  if (av_strnlen(text_cleaned, 1) == 0) {
226  av_freep(&text_cleaned);
227  continue;
228  }
229 
230  // Skip segments that are parts of [BLANK_AUDIO] when max_len splits them
231  if (wctx->max_len > 0 && (strcmp(text_cleaned, "[") == 0 || strcmp(text_cleaned, "]") == 0 ||
232  strcmp(text_cleaned, "BLANK") == 0 || strcmp(text_cleaned, "_") == 0 ||
233  strcmp(text_cleaned, "AUDIO") == 0)) {
234  av_freep(&text_cleaned);
235  continue;
236  }
237 
238  const bool turn = whisper_full_get_segment_speaker_turn_next(wctx->ctx_wsp, i);
239  const int64_t t0_ms = whisper_full_get_segment_t0(wctx->ctx_wsp, i) * 10;
240  const int64_t t1_ms = whisper_full_get_segment_t1(wctx->ctx_wsp, i) * 10;
241 
242  av_log(ctx, AV_LOG_DEBUG, " [%" PRId64 "-%" PRId64 "%s]: \"%s\"\n",
243  timestamp_ms + t0_ms, timestamp_ms + t1_ms, turn ? " (turn)" : "", text_cleaned);
244 
245  if (segments_text) {
246  char *new_text = av_asprintf("%s%s", segments_text, text_cleaned);
247  av_freep(&segments_text);
248  segments_text = new_text;
249  } else
250  segments_text = av_strdup(text_cleaned);
251 
252  if (wctx->avio_context) {
253  const int64_t start_t = timestamp_ms + t0_ms;
254  const int64_t end_t = timestamp_ms + t1_ms;
255  char *buf = NULL;
256 
257  if (!av_strcasecmp(wctx->format, "srt")) {
258  buf =
260  ("%d\n%02" PRId64 ":%02" PRId64 ":%02" PRId64 ",%03" PRId64 " --> %02" PRId64 ":%02" PRId64 ":%02" PRId64 ",%03" PRId64 "\n%s\n\n",
261  wctx->index, start_t / 3600000,
262  (start_t / 60000) % 60, (start_t / 1000) % 60,
263  start_t % 1000, end_t / 3600000, (end_t / 60000) % 60,
264  (end_t / 1000) % 60, end_t % 1000, text_cleaned);
265 
266  wctx->index++;
267  } else if (!av_strcasecmp(wctx->format, "json")) {
268  buf = av_asprintf("{\"start\":%" PRId64 ",\"end\":%" PRId64 ",\"text\":\"%s\"}\n", start_t, end_t, text_cleaned);
269  } else
270  buf = av_asprintf("%s\n", text_cleaned);
271 
272  if (buf) {
273  avio_write(wctx->avio_context, buf, strlen(buf));
274  av_freep(&buf);
275  }
276  }
277 
278  av_freep(&text_cleaned);
279  }
280 
281  AVDictionary **metadata = &frame->metadata;
282  if (metadata && segments_text) {
283  av_dict_set(metadata, "lavfi.whisper.text", segments_text, 0);
284  char *duration_text = av_asprintf("%f", duration);
285  av_dict_set(metadata, "lavfi.whisper.duration", duration_text, AV_DICT_DONT_STRDUP_VAL);
286  }
287  av_freep(&segments_text);
288 
289  if (wctx->audio_buffer_fill_size > samples) {
290  memcpy(wctx->audio_buffer, wctx->audio_buffer + samples,
291  (wctx->audio_buffer_fill_size - samples) * sizeof(*wctx->audio_buffer));
292  wctx->audio_buffer_start_ms += duration * 1000;
293  }
296 }
297 
299 {
300  AVFilterContext *ctx = inlink->dst;
301  WhisperContext *wctx = ctx->priv;
302  AVFilterLink *outlink = ctx->outputs[0];
303 
304  const int samples = frame->nb_samples;
305  const float *input_data = (const float *) frame->data[0];
306 
309  }
310 
311  if (!wctx->audio_buffer_fill_size)
313  (AVRational) {1000, 1},
314  (AVRational) {inlink->time_base.den, inlink->time_base.num});
315  memcpy(wctx->audio_buffer + wctx->audio_buffer_fill_size, input_data, samples * sizeof(*wctx->audio_buffer));
317 
318  if (wctx->ctx_vad
319  && (wctx->audio_buffer_fill_size - wctx->audio_buffer_vad_size) >=
320  av_rescale(wctx->vad_min_speech_duration + wctx->vad_min_silence_duration, WHISPER_SAMPLE_RATE, AV_TIME_BASE)) {
321  struct whisper_vad_segments *segments = whisper_vad_segments_from_samples(wctx->ctx_vad,
322  wctx->vad_params,
323  wctx->audio_buffer,
324  wctx->audio_buffer_fill_size);
326 
327  if (!segments) {
328  av_log(ctx, AV_LOG_ERROR, "failed to detect VAD\n");
329  } else {
330  int n_segments = whisper_vad_segments_n_segments(segments);
331 
332  if (n_segments > 0) {
333  const float start_ms = whisper_vad_segments_get_segment_t0(segments, 0) * 10.0;
334  const float end_ms = whisper_vad_segments_get_segment_t1(segments, n_segments - 1) * 10.0;
335  int end_pos = (int) (end_ms * WHISPER_SAMPLE_RATE / 1000);
336 
337  if (end_pos <= wctx->audio_buffer_fill_size -
338  av_rescale(wctx->vad_min_silence_duration, WHISPER_SAMPLE_RATE, AV_TIME_BASE)) {
340  "VAD detected %d segments, start: %.0f ms, end: %.0f ms (buffer: %d ms)\n",
341  n_segments, start_ms, end_ms, 1000 * wctx->audio_buffer_fill_size / WHISPER_SAMPLE_RATE);
342  run_transcription(ctx, frame, end_pos);
343  }
344  }
345 
346  whisper_vad_free_segments(segments);
347  }
348  } else if (wctx->audio_buffer_fill_size >= wctx->audio_buffer_queue_size)
350 
351  wctx->next_pts = frame->pts + av_rescale_q(samples, (AVRational) {
352  1, inlink->sample_rate}
353  , inlink->time_base);
354  return ff_filter_frame(outlink, frame);
355 }
356 
357 static int push_last_frame(AVFilterLink *outlink)
358 {
359  AVFilterContext *ctx = outlink->src;
360  WhisperContext *wctx = ctx->priv;
361  AVFrame *frame;
362  int n_out = 1;
363 
364  if (ctx->is_disabled || wctx->audio_buffer_fill_size == 0)
365  return 0;
366  frame = ff_get_audio_buffer(outlink, n_out);
367  if (!frame)
368  return AVERROR(ENOMEM);
369 
370  av_samples_set_silence(frame->extended_data, 0, n_out, frame->ch_layout.nb_channels, frame->format);
371 
372  frame->pts = wctx->next_pts;
373  if (wctx->next_pts != AV_NOPTS_VALUE)
374  wctx->next_pts += av_rescale_q(n_out, (AVRational) {
375  1, outlink->sample_rate}
376  , outlink->time_base);
377 
379 
380  return ff_filter_frame(outlink, frame);
381 }
382 
384 {
385  AVFilterLink *inlink = ctx->inputs[0];
386  AVFilterLink *outlink = ctx->outputs[0];
387  WhisperContext *wctx = ctx->priv;
388  int64_t pts;
389  int status;
390 
392 
393  if (!wctx->eof && ff_inlink_queued_frames(inlink)) {
394  AVFrame *frame = NULL;
395  int ret;
396 
398  if (ret < 0)
399  return ret;
400  if (ret > 0)
401  return filter_frame(inlink, frame);
402  }
403 
404  if (!wctx->eof && ff_inlink_acknowledge_status(inlink, &status, &pts))
405  wctx->eof = status == AVERROR_EOF;
406 
407  if (wctx->eof) {
408  push_last_frame(outlink);
409 
410  ff_outlink_set_status(outlink, AVERROR_EOF, wctx->next_pts);
411  return 0;
412  }
413 
415 
416  return FFERROR_NOT_READY;
417 }
418 
420  AVFilterFormatsConfig **cfg_in,
421  AVFilterFormatsConfig **cfg_out)
422 {
424  AVChannelLayout chlayouts[] = { FF_COUNT2LAYOUT(1), { 0 } };
425  int sample_rates[] = { WHISPER_SAMPLE_RATE, -1 };
426  int ret;
427 
429  if (ret < 0)
430  return ret;
431 
432  ret = ff_set_common_channel_layouts_from_list2(ctx, cfg_in, cfg_out, chlayouts);
433  if (ret < 0)
434  return ret;
435 
436  return ff_set_common_samplerates_from_list2(ctx, cfg_in, cfg_out, sample_rates);
437 }
438 
439 #define OFFSET(x) offsetof(WhisperContext, x)
440 #define FLAGS AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_FILTERING_PARAM
441 #define HOURS 3600000000
442 
443 static const AVOption whisper_options[] = {
444  { "model", "Path to the whisper.cpp model file", OFFSET(model_path), AV_OPT_TYPE_STRING,.flags = FLAGS },
445  { "language", "Language for transcription ('auto' for auto-detect)", OFFSET(language), AV_OPT_TYPE_STRING, {.str = "auto"}, .flags = FLAGS },
446  { "queue", "Audio queue size", OFFSET(queue), AV_OPT_TYPE_DURATION, {.i64 = 3000000}, 20000, HOURS, .flags = FLAGS },
447  { "use_gpu", "Use GPU for processing", OFFSET(use_gpu), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, .flags = FLAGS },
448  { "gpu_device", "GPU device to use", OFFSET(gpu_device), AV_OPT_TYPE_INT, {.i64 = 0}, 0, INT_MAX, .flags = FLAGS },
449  { "destination", "Output destination", OFFSET(destination), AV_OPT_TYPE_STRING, {.str = ""}, .flags = FLAGS },
450  { "format", "Output format (text|srt|json)", OFFSET(format), AV_OPT_TYPE_STRING, {.str = "text"},.flags = FLAGS },
451  { "max_len", "Max segment length in characters", OFFSET(max_len), AV_OPT_TYPE_INT, {.i64 = 0}, 0, INT_MAX, .flags = FLAGS },
452  { "vad_model", "Path to the VAD model file", OFFSET(vad_model_path), AV_OPT_TYPE_STRING,.flags = FLAGS },
453  { "vad_threshold", "VAD threshold", OFFSET(vad_threshold), AV_OPT_TYPE_FLOAT, {.dbl = 0.5}, 0.0, 1.0, .flags = FLAGS },
454  { "vad_min_speech_duration", "Minimum speech duration for VAD", OFFSET(vad_min_speech_duration), AV_OPT_TYPE_DURATION, {.i64 = 100000}, 20000, HOURS, .flags = FLAGS },
455  { "vad_min_silence_duration", "Minimum silence duration for VAD", OFFSET(vad_min_silence_duration), AV_OPT_TYPE_DURATION, {.i64 = 500000}, 0, HOURS, .flags = FLAGS },
456  { NULL }
457 };
458 
459 static const AVClass whisper_class = {
460  .class_name = "whisper",
461  .item_name = av_default_item_name,
462  .option = whisper_options,
463  .version = LIBAVUTIL_VERSION_INT,
464 };
465 
467  .p.name = "whisper",
468  .p.description = NULL_IF_CONFIG_SMALL("Transcribe audio using whisper.cpp."),
469  .p.priv_class = &whisper_class,
470  .p.flags = AVFILTER_FLAG_METADATA_ONLY,
471  .init = init,
472  .uninit = uninit,
473  .activate = activate,
474  .priv_size = sizeof(WhisperContext),
478 };
ff_get_audio_buffer
AVFrame * ff_get_audio_buffer(AVFilterLink *link, int nb_samples)
Request an audio samples buffer with a specific set of permissions.
Definition: audio.c:98
AV_LOG_WARNING
#define AV_LOG_WARNING
Something somehow does not look correct.
Definition: log.h:216
level
uint8_t level
Definition: svq3.c:208
AVERROR
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
opt.h
whisper_options
static const AVOption whisper_options[]
Definition: af_whisper.c:443
ff_filter_frame
int ff_filter_frame(AVFilterLink *link, AVFrame *frame)
Send a frame of data to the next filter.
Definition: avfilter.c:1067
sample_fmts
static enum AVSampleFormat sample_fmts[]
Definition: adpcmenc.c:931
thread.h
AVERROR_EOF
#define AVERROR_EOF
End of file.
Definition: error.h:57
FFERROR_NOT_READY
return FFERROR_NOT_READY
Definition: filter_design.txt:204
int64_t
long long int64_t
Definition: coverity.c:34
inlink
The exact code depends on how similar the blocks are and how related they are to the and needs to apply these operations to the correct inlink or outlink if there are several Macros are available to factor that when no extra processing is inlink
Definition: filter_design.txt:212
av_asprintf
char * av_asprintf(const char *fmt,...)
Definition: avstring.c:115
av_strcasecmp
int av_strcasecmp(const char *a, const char *b)
Locale-independent case-insensitive compare.
Definition: avstring.c:208
av_isspace
static av_const int av_isspace(int c)
Locale-independent conversion of ASCII isspace.
Definition: avstring.h:218
FILTER_INPUTS
#define FILTER_INPUTS(array)
Definition: filters.h:264
sample_rates
static const int sample_rates[]
Definition: dcaenc.h:34
AVFrame
This structure describes decoded (raw) audio or video data.
Definition: frame.h:427
WhisperContext::audio_buffer_vad_size
int audio_buffer_vad_size
Definition: af_whisper.c:64
av_samples_set_silence
int av_samples_set_silence(uint8_t *const *audio_data, int offset, int nb_samples, int nb_channels, enum AVSampleFormat sample_fmt)
Fill an audio buffer with silence.
Definition: samplefmt.c:246
AVOption
AVOption.
Definition: opt.h:429
WhisperContext::language
char * language
Definition: af_whisper.c:44
avio_open
int avio_open(AVIOContext **s, const char *filename, int flags)
Create and initialize a AVIOContext for accessing the resource indicated by url.
Definition: avio.c:498
AV_OPT_TYPE_DURATION
@ AV_OPT_TYPE_DURATION
Underlying C type is int64_t.
Definition: opt.h:319
WhisperContext
Definition: af_whisper.c:41
ff_set_common_channel_layouts_from_list2
int ff_set_common_channel_layouts_from_list2(const AVFilterContext *ctx, AVFilterFormatsConfig **cfg_in, AVFilterFormatsConfig **cfg_out, const AVChannelLayout *fmts)
Definition: formats.c:1025
WhisperContext::audio_buffer_queue_size
int audio_buffer_queue_size
Definition: af_whisper.c:62
WhisperContext::use_gpu
bool use_gpu
Definition: af_whisper.c:45
AVDictionary
Definition: dict.c:32
FFMAX
#define FFMAX(a, b)
Definition: macros.h:47
AVFilter::name
const char * name
Filter name.
Definition: avfilter.h:220
FF_FILTER_FORWARD_STATUS_BACK
#define FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink)
Forward the status on an output link to an input link.
Definition: filters.h:639
WhisperContext::avio_context
AVIOContext * avio_context
Definition: af_whisper.c:70
formats.h
ff_inlink_consume_frame
int ff_inlink_consume_frame(AVFilterLink *link, AVFrame **rframe)
Take a frame from the link's FIFO and update the link's stats.
Definition: avfilter.c:1515
whisper_class
static const AVClass whisper_class
Definition: af_whisper.c:459
ff_af_whisper
const FFFilter ff_af_whisper
Definition: af_whisper.c:466
samplefmt.h
run_transcription
static void run_transcription(AVFilterContext *ctx, AVFrame *frame, int samples)
Definition: af_whisper.c:186
WhisperContext::vad_threshold
float vad_threshold
Definition: af_whisper.c:48
pts
static int64_t pts
Definition: transcode_aac.c:644
AV_DICT_DONT_STRDUP_VAL
#define AV_DICT_DONT_STRDUP_VAL
Take ownership of a value that's been allocated with av_malloc() or another memory allocation functio...
Definition: dict.h:79
ff_thread_once
static int ff_thread_once(char *control, void(*routine)(void))
Definition: thread.h:205
init
static int init(AVFilterContext *ctx)
Definition: af_whisper.c:89
AV_LOG_ERROR
#define AV_LOG_ERROR
Something went wrong and cannot losslessly be recovered.
Definition: log.h:210
WhisperContext::gpu_device
int gpu_device
Definition: af_whisper.c:46
FFFilter
Definition: filters.h:267
WhisperContext::audio_buffer_start_ms
int64_t audio_buffer_start_ms
Definition: af_whisper.c:65
float
float
Definition: af_crystalizer.c:122
ff_outlink_set_status
static void ff_outlink_set_status(AVFilterLink *link, int status, int64_t pts)
Set the status field of a link from the source filter.
Definition: filters.h:629
AVIO_FLAG_WRITE
#define AVIO_FLAG_WRITE
write-only
Definition: avio.h:618
ff_set_common_samplerates_from_list2
int ff_set_common_samplerates_from_list2(const AVFilterContext *ctx, AVFilterFormatsConfig **cfg_in, AVFilterFormatsConfig **cfg_out, const int *samplerates)
Definition: formats.c:1049
AV_LOG_DEBUG
#define AV_LOG_DEBUG
Stuff which is only useful for libav* developers.
Definition: log.h:231
ctx
static AVFormatContext * ctx
Definition: movenc.c:49
av_rescale_q
int64_t av_rescale_q(int64_t a, AVRational bq, AVRational cq)
Rescale a 64-bit integer by 2 rational numbers.
Definition: mathematics.c:142
FILTER_OUTPUTS
#define FILTER_OUTPUTS(array)
Definition: filters.h:265
WhisperContext::max_len
int max_len
Definition: af_whisper.c:55
if
if(ret)
Definition: filter_design.txt:179
LIBAVUTIL_VERSION_INT
#define LIBAVUTIL_VERSION_INT
Definition: version.h:85
AV_ONCE_INIT
#define AV_ONCE_INIT
Definition: thread.h:203
AVClass
Describe the class of an AVClass context structure.
Definition: log.h:76
filter_frame
static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
Definition: af_whisper.c:298
metadata
Stream codec metadata
Definition: ogg-flac-chained-meta.txt:2
NULL
#define NULL
Definition: coverity.c:32
format
New swscale design to change SwsGraph is what coordinates multiple passes These can include cascaded scaling error diffusion and so on Or we could have separate passes for the vertical and horizontal scaling In between each SwsPass lies a fully allocated image buffer Graph passes may have different levels of e g we can have a single threaded error diffusion pass following a multi threaded scaling pass SwsGraph is internally recreated whenever the image format
Definition: swscale-v2.txt:14
AVRational
Rational number (pair of numerator and denominator).
Definition: rational.h:58
av_strireplace
char * av_strireplace(const char *str, const char *from, const char *to)
Locale-independent strings replace.
Definition: avstring.c:230
av_strnlen
size_t static size_t av_strnlen(const char *s, size_t len)
Get the count of continuous non zero chars starting from the beginning.
Definition: avstring.h:141
av_default_item_name
const char * av_default_item_name(void *ptr)
Return the context name.
Definition: log.c:242
WhisperContext::audio_buffer_fill_size
int audio_buffer_fill_size
Definition: af_whisper.c:63
WhisperContext::vad_model_path
char * vad_model_path
Definition: af_whisper.c:47
ff_audio_default_filterpad
const AVFilterPad ff_audio_default_filterpad[1]
An AVFilterPad array whose only entry has name "default" and is of type AVMEDIA_TYPE_AUDIO.
Definition: audio.c:34
HOURS
#define HOURS
Definition: af_whisper.c:441
ff_inlink_acknowledge_status
int ff_inlink_acknowledge_status(AVFilterLink *link, int *rstatus, int64_t *rpts)
Test and acknowledge the change of status on the link.
Definition: avfilter.c:1462
AVOnce
#define AVOnce
Definition: thread.h:202
AVFilterFormatsConfig
Lists of formats / etc.
Definition: avfilter.h:121
WhisperContext::model_path
char * model_path
Definition: af_whisper.c:43
ff_inlink_queued_frames
size_t ff_inlink_queued_frames(AVFilterLink *link)
Get the number of frames available on the link.
Definition: avfilter.c:1478
av_log_level
static atomic_int av_log_level
Definition: log.c:59
WhisperContext::format
char * format
Definition: af_whisper.c:54
AVIOContext
Bytestream IO Context.
Definition: avio.h:160
activate
static int activate(AVFilterContext *ctx)
Definition: af_whisper.c:383
NULL_IF_CONFIG_SMALL
#define NULL_IF_CONFIG_SMALL(x)
Return NULL if CONFIG_SMALL is true, otherwise the argument without modification.
Definition: internal.h:94
AVChannelLayout
An AVChannelLayout holds information about the channel layout of audio data.
Definition: channel_layout.h:319
WhisperContext::index
int index
Definition: af_whisper.c:71
dst
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst
Definition: dsp.h:87
i
#define i(width, name, range_min, range_max)
Definition: cbs_h264.c:63
av_err2str
#define av_err2str(errnum)
Convenience macro, the return value should be used only directly in function arguments but never stan...
Definition: error.h:122
AV_SAMPLE_FMT_NONE
@ AV_SAMPLE_FMT_NONE
Definition: samplefmt.h:56
avio.h
WhisperContext::eof
int eof
Definition: af_whisper.c:67
AV_NOPTS_VALUE
#define AV_NOPTS_VALUE
Undefined timestamp value.
Definition: avutil.h:247
user_data
static int FUNC() user_data(CodedBitstreamContext *ctx, RWContext *rw, MPEG2RawUserData *current)
Definition: cbs_mpeg2_syntax_template.c:59
avio_write
void avio_write(AVIOContext *s, const unsigned char *buf, int size)
Definition: aviobuf.c:206
FF_FILTER_FORWARD_WANTED
FF_FILTER_FORWARD_WANTED(outlink, inlink)
WhisperContext::vad_min_silence_duration
int64_t vad_min_silence_duration
Definition: af_whisper.c:50
query_formats
static int query_formats(const AVFilterContext *ctx, AVFilterFormatsConfig **cfg_in, AVFilterFormatsConfig **cfg_out)
Definition: af_whisper.c:419
AV_LOG_INFO
#define AV_LOG_INFO
Standard information.
Definition: log.h:221
AV_OPT_TYPE_FLOAT
@ AV_OPT_TYPE_FLOAT
Underlying C type is float.
Definition: opt.h:271
AVIOContext::direct
int direct
avio_read and avio_write should if possible be satisfied directly instead of going through a buffer,...
Definition: avio.h:268
internal.h
AV_TIME_BASE
#define AV_TIME_BASE
Internal time base represented as integer.
Definition: avutil.h:253
av_malloc_array
#define av_malloc_array(a, b)
Definition: tableprint_vlc.h:32
ff_filter_get_nb_threads
int ff_filter_get_nb_threads(AVFilterContext *ctx)
Get number of threads for current filter instance.
Definition: avfilter.c:845
AVSampleFormat
AVSampleFormat
Audio sample formats.
Definition: samplefmt.h:55
FILTER_QUERY_FUNC2
#define FILTER_QUERY_FUNC2(func)
Definition: filters.h:241
FFMIN
#define FFMIN(a, b)
Definition: macros.h:49
OFFSET
#define OFFSET(x)
Definition: af_whisper.c:439
av_rescale
int64_t av_rescale(int64_t a, int64_t b, int64_t c)
Rescale a 64-bit integer with rounding to nearest.
Definition: mathematics.c:129
uninit
static void uninit(AVFilterContext *ctx)
Definition: af_whisper.c:160
language
Undefined Behavior In the C language
Definition: undefined.txt:3
cb_log
static void cb_log(enum ggml_log_level level, const char *text, void *user_data)
Definition: af_whisper.c:74
ret
ret
Definition: filter_design.txt:187
AVClass::class_name
const char * class_name
The name of the class; usually it is the same name as the context structure type to which the AVClass...
Definition: log.h:81
frame
these buffered frames must be flushed immediately if a new input produces new the filter must not call request_frame to get more It must just process the frame or queue it The task of requesting more frames is left to the filter s request_frame method or the application If a filter has several the filter must be ready for frames arriving randomly on any input any filter with several inputs will most likely require some kind of queuing mechanism It is perfectly acceptable to have a limited queue and to drop frames when the inputs are too unbalanced request_frame For filters that do not use the this method is called when a frame is wanted on an output For a it should directly call filter_frame on the corresponding output For a if there are queued frames already one of these frames should be pushed If the filter should request a frame on one of its repeatedly until at least one frame has been pushed Return or at least make progress towards producing a frame
Definition: filter_design.txt:265
WhisperContext::audio_buffer
float * audio_buffer
Definition: af_whisper.c:61
FF_COUNT2LAYOUT
#define FF_COUNT2LAYOUT(c)
Encode a channel count as a channel layout.
Definition: formats.h:102
ff_set_sample_formats_from_list2
int ff_set_sample_formats_from_list2(const AVFilterContext *ctx, AVFilterFormatsConfig **cfg_in, AVFilterFormatsConfig **cfg_out, const enum AVSampleFormat *fmts)
Definition: formats.c:1153
push_last_frame
static int push_last_frame(AVFilterLink *outlink)
Definition: af_whisper.c:357
status
ov_status_e status
Definition: dnn_backend_openvino.c:100
channel_layout.h
WhisperContext::ctx_vad
struct whisper_vad_context * ctx_vad
Definition: af_whisper.c:58
AV_OPT_TYPE_INT
@ AV_OPT_TYPE_INT
Underlying C type is int.
Definition: opt.h:259
avfilter.h
WhisperContext::ctx_wsp
struct whisper_context * ctx_wsp
Definition: af_whisper.c:57
AVFILTER_FLAG_METADATA_ONLY
#define AVFILTER_FLAG_METADATA_ONLY
The filter is a "metadata" filter - it does not modify the frame data in any way.
Definition: avfilter.h:183
WhisperContext::destination
char * destination
Definition: af_whisper.c:53
samples
Filter the word “frame” indicates either a video frame or a group of audio samples
Definition: filter_design.txt:8
AVFilterContext
An instance of a filter.
Definition: avfilter.h:274
AVIO_FLAG_DIRECT
#define AVIO_FLAG_DIRECT
Use direct mode.
Definition: avio.h:644
WhisperContext::next_pts
int64_t next_pts
Definition: af_whisper.c:68
av_strdup
char * av_strdup(const char *s)
Duplicate a string.
Definition: mem.c:272
FFFilter::p
AVFilter p
The public AVFilter.
Definition: filters.h:271
FLAGS
#define FLAGS
Definition: af_whisper.c:440
avutil.h
mem.h
audio.h
AV_OPT_TYPE_BOOL
@ AV_OPT_TYPE_BOOL
Underlying C type is int.
Definition: opt.h:327
avio_closep
int avio_closep(AVIOContext **s)
Close the resource accessed by the AVIOContext *s, free it and set the pointer pointing to it to NULL...
Definition: avio.c:650
av_freep
#define av_freep(p)
Definition: tableprint_vlc.h:35
av_dict_set
int av_dict_set(AVDictionary **pm, const char *key, const char *value, int flags)
Set the given entry in *pm, overwriting an existing entry.
Definition: dict.c:86
WhisperContext::vad_min_speech_duration
int64_t vad_min_speech_duration
Definition: af_whisper.c:49
WhisperContext::vad_params
struct whisper_vad_params vad_params
Definition: af_whisper.c:59
av_log
#define av_log(a,...)
Definition: tableprint_vlc.h:27
WhisperContext::queue
int64_t queue
Definition: af_whisper.c:52
avstring.h
AV_OPT_TYPE_STRING
@ AV_OPT_TYPE_STRING
Underlying C type is a uint8_t* that is either NULL or points to a C string allocated with the av_mal...
Definition: opt.h:276
input_data
static void input_data(MLPEncodeContext *ctx, MLPSubstream *s, uint8_t **const samples, int nb_samples)
Wrapper function for inputting data in two different bit-depths.
Definition: mlpenc.c:1219
AV_SAMPLE_FMT_FLT
@ AV_SAMPLE_FMT_FLT
float
Definition: samplefmt.h:60
duration
static int64_t duration
Definition: ffplay.c:329