/*
 * Copyright (c) 2025 Vittorio Palmisano
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with FFmpeg; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#include <whisper.h>

#include "libavutil/avutil.h"
#include "libavutil/avstring.h"
#include "libavutil/channel_layout.h"
#include "libavutil/internal.h"
#include "libavutil/mem.h"
#include "libavutil/opt.h"
#include "libavutil/samplefmt.h"
#include "libavutil/thread.h"
#include "libavformat/avio.h"
#include "libavfilter/avfilter.h"
#include "libavfilter/audio.h"

#include "filters.h"
#include "formats.h"

41 typedef struct WhisperContext {
42  const AVClass *class;
43  char *model_path;
44  char *language;
45  bool use_gpu;
51 
53  char *destination;
54  char *format;
55  int max_len;
56 
57  struct whisper_context *ctx_wsp;
58  struct whisper_vad_context *ctx_vad;
59  struct whisper_vad_params vad_params;
60 
61  float *audio_buffer;
66 
67  int eof;
69 
71  int index;
73 
74 static void cb_log(enum ggml_log_level level, const char *text, void *user_data)
75 {
78  switch (level) {
79  case GGML_LOG_LEVEL_ERROR:
81  break;
82  case GGML_LOG_LEVEL_WARN:
84  break;
85  }
86  av_log(ctx, av_log_level, "%s", text);
87 }
88 
89 static int init(AVFilterContext *ctx)
90 {
91  WhisperContext *wctx = ctx->priv;
92 
93  static AVOnce init_static_once = AV_ONCE_INIT;
94  ff_thread_once(&init_static_once, ggml_backend_load_all);
95 
96  whisper_log_set(cb_log, ctx);
97 
98  // Init whisper context
99  if (!wctx->model_path) {
100  av_log(ctx, AV_LOG_ERROR, "No whisper model path specified. Use the 'model' option.\n");
101  return AVERROR(EINVAL);
102  }
103 
104  struct whisper_context_params params = whisper_context_default_params();
105  params.use_gpu = wctx->use_gpu;
106  params.gpu_device = wctx->gpu_device;
107 
108  wctx->ctx_wsp = whisper_init_from_file_with_params(wctx->model_path, params);
109  if (wctx->ctx_wsp == NULL) {
110  av_log(ctx, AV_LOG_ERROR, "Failed to initialize whisper context from model: %s\n", wctx->model_path);
111  return AVERROR(EIO);
112  }
113 
114  // Init buffer
115  wctx->audio_buffer_queue_size = av_rescale(wctx->queue, WHISPER_SAMPLE_RATE, AV_TIME_BASE);
116  wctx->audio_buffer = av_malloc_array(wctx->audio_buffer_queue_size, sizeof(*wctx->audio_buffer));
117  if (!wctx->audio_buffer)
118  return AVERROR(ENOMEM);
119 
120  // Init VAD model context
121  if (wctx->vad_model_path) {
122  struct whisper_vad_context_params ctx_params = whisper_vad_default_context_params();
123  ctx_params.n_threads = ff_filter_get_nb_threads(ctx);
124  // ctx_params.use_gpu = wctx->use_gpu; TODO (see: whisper_vad_init_context)
125  ctx_params.gpu_device = wctx->gpu_device;
126  wctx->ctx_vad = whisper_vad_init_from_file_with_params(wctx->vad_model_path, ctx_params);
127 
128  wctx->vad_params = whisper_vad_default_params();
129  wctx->vad_params.threshold = wctx->vad_threshold;
130  wctx->vad_params.min_speech_duration_ms = av_rescale(wctx->vad_min_speech_duration, 1000, AV_TIME_BASE);
131  wctx->vad_params.min_silence_duration_ms = av_rescale(wctx->vad_min_silence_duration, 1000, AV_TIME_BASE);
132  wctx->vad_params.max_speech_duration_s = av_rescale(wctx->queue, 1, AV_TIME_BASE);
133  wctx->vad_params.speech_pad_ms = 0;
134  wctx->vad_params.samples_overlap = 0;
135  }
136 
137  wctx->next_pts = AV_NOPTS_VALUE;
138 
139  if (wctx->destination && strcmp("", wctx->destination)) {
140  const char *dst = wctx->destination;
141  if (!strcmp("-", dst))
142  dst = "pipe:1";
143  int ret = avio_open(&wctx->avio_context, dst, AVIO_FLAG_WRITE);
144 
145  if (ret < 0) {
146  av_log(ctx, AV_LOG_ERROR, "Could not open %s: %s\n", wctx->destination, av_err2str(ret));
147  return ret;
148  }
149 
151  }
152 
154  "Whisper filter initialized: model: %s lang: %s queue: %" PRId64 " ms\n",
155  wctx->model_path, wctx->language, wctx->queue / 1000);
156 
157  return 0;
158 }
159 
161 {
162  WhisperContext *wctx = ctx->priv;
163 
164  if (wctx->audio_buffer_fill_size > 0) {
166  "Remaining audio buffer %d samples (%d seconds) after stopping\n",
167  wctx->audio_buffer_fill_size, wctx->audio_buffer_fill_size / WHISPER_SAMPLE_RATE);
168  }
169 
170  if (wctx->ctx_vad) {
171  whisper_vad_free(wctx->ctx_vad);
172  wctx->ctx_vad = NULL;
173  }
174 
175  if (wctx->ctx_wsp) {
176  whisper_free(wctx->ctx_wsp);
177  wctx->ctx_wsp = NULL;
178  }
179 
180  av_freep(&wctx->audio_buffer);
181 
182  if (wctx->avio_context)
183  avio_closep(&wctx->avio_context);
184 }
185 
187 {
188  WhisperContext *wctx = ctx->priv;
190 
191  if (!wctx->ctx_wsp || samples == 0)
192  return;
193 
194  const int64_t timestamp_ms = wctx->audio_buffer_start_ms;
195  const float duration = (float) samples / WHISPER_SAMPLE_RATE;
196 
198  "run transcription at %" PRId64 " ms, %d/%d samples (%.2f seconds)...\n",
199  timestamp_ms, samples, wctx->audio_buffer_fill_size, duration);
200 
201  struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
202  params.language = wctx->language;
203  params.n_threads = ff_filter_get_nb_threads(ctx);
204  params.print_special = 0;
205  params.print_progress = 0;
206  params.print_realtime = 0;
207  params.print_timestamps = 0;
208  params.max_len = wctx->max_len;
209  params.token_timestamps = (wctx->max_len > 0);
210 
211  if (whisper_full(wctx->ctx_wsp, params, wctx->audio_buffer, samples) != 0) {
212  av_log(ctx, AV_LOG_ERROR, "Failed to process audio with whisper.cpp\n");
213  return;
214  }
215 
216  const int n_segments = whisper_full_n_segments(wctx->ctx_wsp);
217  char *segments_text = NULL;
218 
219  for (int i = 0; i < n_segments; ++i) {
220  const char *text = whisper_full_get_segment_text(wctx->ctx_wsp, i);
221  if (av_isspace(text[0]))
222  text++;
223  char *text_cleaned = av_strireplace(text, "[BLANK_AUDIO]", "");
224 
225  if (av_strnlen(text_cleaned, 1) == 0) {
226  av_freep(&text_cleaned);
227  continue;
228  }
229 
230  // Skip segments that are parts of [BLANK_AUDIO] when max_len splits them
231  if (wctx->max_len > 0 && (strcmp(text_cleaned, "[") == 0 || strcmp(text_cleaned, "]") == 0 ||
232  strcmp(text_cleaned, "BLANK") == 0 || strcmp(text_cleaned, "_") == 0 ||
233  strcmp(text_cleaned, "AUDIO") == 0)) {
234  av_freep(&text_cleaned);
235  continue;
236  }
237 
238  const bool turn = whisper_full_get_segment_speaker_turn_next(wctx->ctx_wsp, i);
239  const int64_t t0_ms = whisper_full_get_segment_t0(wctx->ctx_wsp, i) * 10;
240  const int64_t t1_ms = whisper_full_get_segment_t1(wctx->ctx_wsp, i) * 10;
241 
242  av_log(ctx, AV_LOG_DEBUG, " [%" PRId64 "-%" PRId64 "%s]: \"%s\"\n",
243  timestamp_ms + t0_ms, timestamp_ms + t1_ms, turn ? " (turn)" : "", text_cleaned);
244 
245  if (segments_text) {
246  char *new_text = av_asprintf("%s%s", segments_text, text_cleaned);
247  av_freep(&segments_text);
248  segments_text = new_text;
249  } else
250  segments_text = av_strdup(text_cleaned);
251 
252  if (wctx->avio_context) {
253  const int64_t start_t = timestamp_ms + t0_ms;
254  const int64_t end_t = timestamp_ms + t1_ms;
255  char *buf = NULL;
256 
257  if (!av_strcasecmp(wctx->format, "srt")) {
258  buf =
260  ("%d\n%02" PRId64 ":%02" PRId64 ":%02" PRId64 ",%03" PRId64 " --> %02" PRId64 ":%02" PRId64 ":%02" PRId64 ",%03" PRId64 "\n%s\n\n",
261  wctx->index, start_t / 3600000,
262  (start_t / 60000) % 60, (start_t / 1000) % 60,
263  start_t % 1000, end_t / 3600000, (end_t / 60000) % 60,
264  (end_t / 1000) % 60, end_t % 1000, text_cleaned);
265 
266  wctx->index++;
267  } else if (!av_strcasecmp(wctx->format, "json")) {
268  buf = av_asprintf("{\"start\":%" PRId64 ",\"end\":%" PRId64 ",\"text\":\"%s\"}\n", start_t, end_t, text_cleaned);
269  } else
270  buf = av_asprintf("%s\n", text_cleaned);
271 
272  if (buf) {
273  avio_write(wctx->avio_context, buf, strlen(buf));
274  av_freep(&buf);
275  }
276  }
277 
278  av_freep(&text_cleaned);
279  }
280 
281  AVDictionary **metadata = &frame->metadata;
282  if (metadata && segments_text) {
283  av_dict_set(metadata, "lavfi.whisper.text", segments_text, 0);
284  char *duration_text = av_asprintf("%f", duration);
285  av_dict_set(metadata, "lavfi.whisper.duration", duration_text, AV_DICT_DONT_STRDUP_VAL);
286  }
287  av_freep(&segments_text);
288 
289  if (wctx->audio_buffer_fill_size > samples) {
290  memcpy(wctx->audio_buffer, wctx->audio_buffer + samples,
291  (wctx->audio_buffer_fill_size - samples) * sizeof(*wctx->audio_buffer));
292  wctx->audio_buffer_start_ms += duration * 1000;
293  }
296 }
297 
299 {
300  AVFilterContext *ctx = inlink->dst;
301  WhisperContext *wctx = ctx->priv;
302  AVFilterLink *outlink = ctx->outputs[0];
303 
304  const int samples = frame->nb_samples;
305  const float *input_data = (const float *) frame->data[0];
306 
309  }
310 
311  if (!wctx->audio_buffer_fill_size)
313  (AVRational) {1000, 1},
314  (AVRational) {inlink->time_base.den, inlink->time_base.num});
315  memcpy(wctx->audio_buffer + wctx->audio_buffer_fill_size, input_data, samples * sizeof(*wctx->audio_buffer));
317 
318  if (wctx->ctx_vad
319  && (wctx->audio_buffer_fill_size - wctx->audio_buffer_vad_size) >=
320  av_rescale(wctx->vad_min_speech_duration + wctx->vad_min_silence_duration, WHISPER_SAMPLE_RATE, AV_TIME_BASE)) {
321  struct whisper_vad_segments *segments = whisper_vad_segments_from_samples(wctx->ctx_vad,
322  wctx->vad_params,
323  wctx->audio_buffer,
324  wctx->audio_buffer_fill_size);
326 
327  if (!segments) {
328  av_log(ctx, AV_LOG_ERROR, "failed to detect VAD\n");
329  } else {
330  int n_segments = whisper_vad_segments_n_segments(segments);
331 
332  if (n_segments > 0) {
333  const float start_ms = whisper_vad_segments_get_segment_t0(segments, 0) * 10.0;
334  const float end_ms = whisper_vad_segments_get_segment_t1(segments, n_segments - 1) * 10.0;
335  int end_pos = (int) (end_ms * WHISPER_SAMPLE_RATE / 1000);
336 
337  if (end_pos <= wctx->audio_buffer_fill_size -
338  av_rescale(wctx->vad_min_silence_duration, WHISPER_SAMPLE_RATE, AV_TIME_BASE)) {
340  "VAD detected %d segments, start: %.0f ms, end: %.0f ms (buffer: %d ms)\n",
341  n_segments, start_ms, end_ms, 1000 * wctx->audio_buffer_fill_size / WHISPER_SAMPLE_RATE);
342  run_transcription(ctx, frame, end_pos);
343  }
344  }
345 
346  whisper_vad_free_segments(segments);
347  }
348  } else if (wctx->audio_buffer_fill_size >= wctx->audio_buffer_queue_size)
350 
351  wctx->next_pts = frame->pts + av_rescale_q(samples, (AVRational) {
352  1, inlink->sample_rate}
353  , inlink->time_base);
354  return ff_filter_frame(outlink, frame);
355 }
356 
357 static int push_last_frame(AVFilterLink *outlink)
358 {
359  AVFilterContext *ctx = outlink->src;
360  WhisperContext *wctx = ctx->priv;
361  AVFrame *frame;
362  int n_out = 1;
363 
364  if (ctx->is_disabled || wctx->audio_buffer_fill_size == 0)
365  return 0;
366  frame = ff_get_audio_buffer(outlink, n_out);
367  if (!frame)
368  return AVERROR(ENOMEM);
369 
370  av_samples_set_silence(frame->extended_data, 0, n_out, frame->ch_layout.nb_channels, frame->format);
371 
372  frame->pts = wctx->next_pts;
373  if (wctx->next_pts != AV_NOPTS_VALUE)
374  wctx->next_pts += av_rescale_q(n_out, (AVRational) {
375  1, outlink->sample_rate}
376  , outlink->time_base);
377 
379 
380  return ff_filter_frame(outlink, frame);
381 }
382 
384 {
385  AVFilterLink *inlink = ctx->inputs[0];
386  AVFilterLink *outlink = ctx->outputs[0];
387  WhisperContext *wctx = ctx->priv;
388  int64_t pts;
389  int status;
390 
392 
393  if (!wctx->eof && ff_inlink_queued_frames(inlink)) {
394  AVFrame *frame = NULL;
395  int ret;
396 
398  if (ret < 0)
399  return ret;
400  if (ret > 0)
401  return filter_frame(inlink, frame);
402  }
403 
404  if (!wctx->eof && ff_inlink_acknowledge_status(inlink, &status, &pts))
405  wctx->eof = status == AVERROR_EOF;
406 
407  if (wctx->eof) {
408  push_last_frame(outlink);
409 
410  ff_outlink_set_status(outlink, AVERROR_EOF, wctx->next_pts);
411  return 0;
412  }
413 
415 
416  return FFERROR_NOT_READY;
417 }
418 
420  AVFilterFormatsConfig **cfg_in,
421  AVFilterFormatsConfig **cfg_out)
422 {
424  AVChannelLayout chlayouts[] = { FF_COUNT2LAYOUT(1), { 0 } };
425  int sample_rates[] = { WHISPER_SAMPLE_RATE, -1 };
426  int ret;
427 
429  if (ret < 0)
430  return ret;
431 
432  ret = ff_set_common_channel_layouts_from_list2(ctx, cfg_in, cfg_out, chlayouts);
433  if (ret < 0)
434  return ret;
435 
436  return ff_set_common_samplerates_from_list2(ctx, cfg_in, cfg_out, sample_rates);
437 }
438 
439 #define OFFSET(x) offsetof(WhisperContext, x)
440 #define FLAGS AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_FILTERING_PARAM
441 #define HOURS 3600000000
442 
443 static const AVOption whisper_options[] = {
444  { "model", "Path to the whisper.cpp model file", OFFSET(model_path), AV_OPT_TYPE_STRING,.flags = FLAGS },
445  { "language", "Language for transcription ('auto' for auto-detect)", OFFSET(language), AV_OPT_TYPE_STRING, {.str = "auto"}, .flags = FLAGS },
446  { "queue", "Audio queue size", OFFSET(queue), AV_OPT_TYPE_DURATION, {.i64 = 3000000}, 20000, HOURS, .flags = FLAGS },
447  { "use_gpu", "Use GPU for processing", OFFSET(use_gpu), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, .flags = FLAGS },
448  { "gpu_device", "GPU device to use", OFFSET(gpu_device), AV_OPT_TYPE_INT, {.i64 = 0}, 0, INT_MAX, .flags = FLAGS },
449  { "destination", "Output destination", OFFSET(destination), AV_OPT_TYPE_STRING, {.str = ""}, .flags = FLAGS },
450  { "format", "Output format (text|srt|json)", OFFSET(format), AV_OPT_TYPE_STRING, {.str = "text"},.flags = FLAGS },
451  { "max_len", "Max segment length in characters", OFFSET(max_len), AV_OPT_TYPE_INT, {.i64 = 0}, 0, INT_MAX, .flags = FLAGS },
452  { "vad_model", "Path to the VAD model file", OFFSET(vad_model_path), AV_OPT_TYPE_STRING,.flags = FLAGS },
453  { "vad_threshold", "VAD threshold", OFFSET(vad_threshold), AV_OPT_TYPE_FLOAT, {.dbl = 0.5}, 0.0, 1.0, .flags = FLAGS },
454  { "vad_min_speech_duration", "Minimum speech duration for VAD", OFFSET(vad_min_speech_duration), AV_OPT_TYPE_DURATION, {.i64 = 100000}, 20000, HOURS, .flags = FLAGS },
455  { "vad_min_silence_duration", "Minimum silence duration for VAD", OFFSET(vad_min_silence_duration), AV_OPT_TYPE_DURATION, {.i64 = 500000}, 0, HOURS, .flags = FLAGS },
456  { NULL }
457 };
458 
459 static const AVClass whisper_class = {
460  .class_name = "whisper",
461  .item_name = av_default_item_name,
462  .option = whisper_options,
463  .version = LIBAVUTIL_VERSION_INT,
464 };
465 
467  .p.name = "whisper",
468  .p.description = NULL_IF_CONFIG_SMALL("Transcribe audio using whisper.cpp."),
469  .p.priv_class = &whisper_class,
470  .p.flags = AVFILTER_FLAG_METADATA_ONLY,
471  .init = init,
472  .uninit = uninit,
473  .activate = activate,
474  .priv_size = sizeof(WhisperContext),
478 };
ff_get_audio_buffer
AVFrame * ff_get_audio_buffer(AVFilterLink *link, int nb_samples)
Request an audio samples buffer with a specific set of permissions.
Definition: audio.c:98
AV_LOG_WARNING
#define AV_LOG_WARNING
Something somehow does not look correct.
Definition: log.h:216
level
uint8_t level
Definition: svq3.c:208
AVERROR
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
opt.h
whisper_options
static const AVOption whisper_options[]
Definition: af_whisper.c:443
ff_filter_frame
int ff_filter_frame(AVFilterLink *link, AVFrame *frame)
Send a frame of data to the next filter.
Definition: avfilter.c:1067
sample_fmts
static enum AVSampleFormat sample_fmts[]
Definition: adpcmenc.c:931
thread.h
AVERROR_EOF
#define AVERROR_EOF
End of file.
Definition: error.h:57
FFERROR_NOT_READY
return FFERROR_NOT_READY
Definition: filter_design.txt:204
int64_t
long long int64_t
Definition: coverity.c:34
inlink
The exact code depends on how similar the blocks are and how related they are to the and needs to apply these operations to the correct inlink or outlink if there are several Macros are available to factor that when no extra processing is inlink
Definition: filter_design.txt:212
av_asprintf
char * av_asprintf(const char *fmt,...)
Definition: avstring.c:115
av_strcasecmp
int av_strcasecmp(const char *a, const char *b)
Locale-independent case-insensitive compare.
Definition: avstring.c:208
av_isspace
static av_const int av_isspace(int c)
Locale-independent conversion of ASCII isspace.
Definition: avstring.h:218
FILTER_INPUTS
#define FILTER_INPUTS(array)
Definition: filters.h:264
sample_rates
static const int sample_rates[]
Definition: dcaenc.h:34
AVFrame
This structure describes decoded (raw) audio or video data.
Definition: frame.h:427
WhisperContext::audio_buffer_vad_size
int audio_buffer_vad_size
Definition: af_whisper.c:64
av_samples_set_silence
int av_samples_set_silence(uint8_t *const *audio_data, int offset, int nb_samples, int nb_channels, enum AVSampleFormat sample_fmt)
Fill an audio buffer with silence.
Definition: samplefmt.c:246
AVOption
AVOption.
Definition: opt.h:429
WhisperContext::language
char * language
Definition: af_whisper.c:44
avio_open
int avio_open(AVIOContext **s, const char *filename, int flags)
Create and initialize a AVIOContext for accessing the resource indicated by url.
Definition: avio.c:498
AV_OPT_TYPE_DURATION
@ AV_OPT_TYPE_DURATION
Underlying C type is int64_t.
Definition: opt.h:319
WhisperContext
Definition: af_whisper.c:41
ff_set_common_channel_layouts_from_list2
int ff_set_common_channel_layouts_from_list2(const AVFilterContext *ctx, AVFilterFormatsConfig **cfg_in, AVFilterFormatsConfig **cfg_out, const AVChannelLayout *fmts)
Definition: formats.c:1025
WhisperContext::audio_buffer_queue_size
int audio_buffer_queue_size
Definition: af_whisper.c:62
WhisperContext::use_gpu
bool use_gpu
Definition: af_whisper.c:45
AVDictionary
Definition: dict.c:32
FFMAX
#define FFMAX(a, b)
Definition: macros.h:47
AVFilter::name
const char * name
Filter name.
Definition: avfilter.h:220
FF_FILTER_FORWARD_STATUS_BACK
#define FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink)
Forward the status on an output link to an input link.
Definition: filters.h:639
WhisperContext::avio_context
AVIOContext * avio_context
Definition: af_whisper.c:70
formats.h
ff_inlink_consume_frame
int ff_inlink_consume_frame(AVFilterLink *link, AVFrame **rframe)
Take a frame from the link's FIFO and update the link's stats.
Definition: avfilter.c:1515
whisper_class
static const AVClass whisper_class
Definition: af_whisper.c:459
ff_af_whisper
const FFFilter ff_af_whisper
Definition: af_whisper.c:466
samplefmt.h
run_transcription
static void run_transcription(AVFilterContext *ctx, AVFrame *frame, int samples)
Definition: af_whisper.c:186
WhisperContext::vad_threshold
float vad_threshold
Definition: af_whisper.c:48
pts
static int64_t pts
Definition: transcode_aac.c:644
AV_DICT_DONT_STRDUP_VAL
#define AV_DICT_DONT_STRDUP_VAL
Take ownership of a value that's been allocated with av_malloc() or another memory allocation functio...
Definition: dict.h:79
ff_thread_once
static int ff_thread_once(char *control, void(*routine)(void))
Definition: thread.h:205
init
static int init(AVFilterContext *ctx)
Definition: af_whisper.c:89
AV_LOG_ERROR
#define AV_LOG_ERROR
Something went wrong and cannot losslessly be recovered.
Definition: log.h:210
WhisperContext::gpu_device
int gpu_device
Definition: af_whisper.c:46
FFFilter
Definition: filters.h:267
WhisperContext::audio_buffer_start_ms
int64_t audio_buffer_start_ms
Definition: af_whisper.c:65
float
float
Definition: af_crystalizer.c:122
ff_outlink_set_status
static void ff_outlink_set_status(AVFilterLink *link, int status, int64_t pts)
Set the status field of a link from the source filter.
Definition: filters.h:629
AVIO_FLAG_WRITE
#define AVIO_FLAG_WRITE
write-only
Definition: avio.h:618
ff_set_common_samplerates_from_list2
int ff_set_common_samplerates_from_list2(const AVFilterContext *ctx, AVFilterFormatsConfig **cfg_in, AVFilterFormatsConfig **cfg_out, const int *samplerates)
Definition: formats.c:1049
AV_LOG_DEBUG
#define AV_LOG_DEBUG
Stuff which is only useful for libav* developers.
Definition: log.h:231
ctx
static AVFormatContext * ctx
Definition: movenc.c:49
av_rescale_q
int64_t av_rescale_q(int64_t a, AVRational bq, AVRational cq)
Rescale a 64-bit integer by 2 rational numbers.
Definition: mathematics.c:142
FILTER_OUTPUTS
#define FILTER_OUTPUTS(array)
Definition: filters.h:265
WhisperContext::max_len
int max_len
Definition: af_whisper.c:55
if
if(ret)
Definition: filter_design.txt:179
LIBAVUTIL_VERSION_INT
#define LIBAVUTIL_VERSION_INT
Definition: version.h:85
AV_ONCE_INIT
#define AV_ONCE_INIT
Definition: thread.h:203
AVClass
Describe the class of an AVClass context structure.
Definition: log.h:76
filter_frame
static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
Definition: af_whisper.c:298
metadata
Stream codec metadata
Definition: ogg-flac-chained-meta.txt:2
NULL
#define NULL
Definition: coverity.c:32
format
New swscale design to change SwsGraph is what coordinates multiple passes These can include cascaded scaling error diffusion and so on Or we could have separate passes for the vertical and horizontal scaling In between each SwsPass lies a fully allocated image buffer Graph passes may have different levels of e g we can have a single threaded error diffusion pass following a multi threaded scaling pass SwsGraph is internally recreated whenever the image format
Definition: swscale-v2.txt:14
AVRational
Rational number (pair of numerator and denominator).
Definition: rational.h:58
av_strireplace
char * av_strireplace(const char *str, const char *from, const char *to)
Locale-independent strings replace.
Definition: avstring.c:230
av_strnlen
size_t static size_t av_strnlen(const char *s, size_t len)
Get the count of continuous non zero chars starting from the beginning.
Definition: avstring.h:141
av_default_item_name
const char * av_default_item_name(void *ptr)
Return the context name.
Definition: log.c:242
WhisperContext::audio_buffer_fill_size
int audio_buffer_fill_size
Definition: af_whisper.c:63
WhisperContext::vad_model_path
char * vad_model_path
Definition: af_whisper.c:47
ff_audio_default_filterpad
const AVFilterPad ff_audio_default_filterpad[1]
An AVFilterPad array whose only entry has name "default" and is of type AVMEDIA_TYPE_AUDIO.
Definition: audio.c:34
HOURS
#define HOURS
Definition: af_whisper.c:441
ff_inlink_acknowledge_status
int ff_inlink_acknowledge_status(AVFilterLink *link, int *rstatus, int64_t *rpts)
Test and acknowledge the change of status on the link.
Definition: avfilter.c:1462
AVOnce
#define AVOnce
Definition: thread.h:202
AVFilterFormatsConfig
Lists of formats / etc.
Definition: avfilter.h:121
WhisperContext::model_path
char * model_path
Definition: af_whisper.c:43
ff_inlink_queued_frames
size_t ff_inlink_queued_frames(AVFilterLink *link)
Get the number of frames available on the link.
Definition: avfilter.c:1478
av_log_level
static atomic_int av_log_level
Definition: log.c:59
WhisperContext::format
char * format
Definition: af_whisper.c:54
AVIOContext
Bytestream IO Context.
Definition: avio.h:160
activate
static int activate(AVFilterContext *ctx)
Definition: af_whisper.c:383
NULL_IF_CONFIG_SMALL
#define NULL_IF_CONFIG_SMALL(x)
Return NULL if CONFIG_SMALL is true, otherwise the argument without modification.
Definition: internal.h:94
AVChannelLayout
An AVChannelLayout holds information about the channel layout of audio data.
Definition: channel_layout.h:319
WhisperContext::index
int index
Definition: af_whisper.c:71
dst
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst
Definition: dsp.h:87
i
#define i(width, name, range_min, range_max)
Definition: cbs_h264.c:63
av_err2str
#define av_err2str(errnum)
Convenience macro, the return value should be used only directly in function arguments but never stan...
Definition: error.h:122
AV_SAMPLE_FMT_NONE
@ AV_SAMPLE_FMT_NONE
Definition: samplefmt.h:56
avio.h
WhisperContext::eof
int eof
Definition: af_whisper.c:67
AV_NOPTS_VALUE
#define AV_NOPTS_VALUE
Undefined timestamp value.
Definition: avutil.h:247
user_data
static int FUNC() user_data(CodedBitstreamContext *ctx, RWContext *rw, MPEG2RawUserData *current)
Definition: cbs_mpeg2_syntax_template.c:59
avio_write
void avio_write(AVIOContext *s, const unsigned char *buf, int size)
Definition: aviobuf.c:206
FF_FILTER_FORWARD_WANTED
FF_FILTER_FORWARD_WANTED(outlink, inlink)
WhisperContext::vad_min_silence_duration
int64_t vad_min_silence_duration
Definition: af_whisper.c:50
query_formats
static int query_formats(const AVFilterContext *ctx, AVFilterFormatsConfig **cfg_in, AVFilterFormatsConfig **cfg_out)
Definition: af_whisper.c:419
AV_LOG_INFO
#define AV_LOG_INFO
Standard information.
Definition: log.h:221
AV_OPT_TYPE_FLOAT
@ AV_OPT_TYPE_FLOAT
Underlying C type is float.
Definition: opt.h:271
AVIOContext::direct
int direct
avio_read and avio_write should if possible be satisfied directly instead of going through a buffer,...
Definition: avio.h:268
internal.h
AV_TIME_BASE
#define AV_TIME_BASE
Internal time base represented as integer.
Definition: avutil.h:253
av_malloc_array
#define av_malloc_array(a, b)
Definition: tableprint_vlc.h:32
ff_filter_get_nb_threads
int ff_filter_get_nb_threads(AVFilterContext *ctx)
Get number of threads for current filter instance.
Definition: avfilter.c:845
AVSampleFormat
AVSampleFormat
Audio sample formats.
Definition: samplefmt.h:55
FILTER_QUERY_FUNC2
#define FILTER_QUERY_FUNC2(func)
Definition: filters.h:241
FFMIN
#define FFMIN(a, b)
Definition: macros.h:49
OFFSET
#define OFFSET(x)
Definition: af_whisper.c:439
av_rescale
int64_t av_rescale(int64_t a, int64_t b, int64_t c)
Rescale a 64-bit integer with rounding to nearest.
Definition: mathematics.c:129
uninit
static void uninit(AVFilterContext *ctx)
Definition: af_whisper.c:160
language
Undefined Behavior In the C language
Definition: undefined.txt:3
cb_log
static void cb_log(enum ggml_log_level level, const char *text, void *user_data)
Definition: af_whisper.c:74
ret
ret
Definition: filter_design.txt:187
AVClass::class_name
const char * class_name
The name of the class; usually it is the same name as the context structure type to which the AVClass...
Definition: log.h:81
frame
these buffered frames must be flushed immediately if a new input produces new the filter must not call request_frame to get more It must just process the frame or queue it The task of requesting more frames is left to the filter s request_frame method or the application If a filter has several the filter must be ready for frames arriving randomly on any input any filter with several inputs will most likely require some kind of queuing mechanism It is perfectly acceptable to have a limited queue and to drop frames when the inputs are too unbalanced request_frame For filters that do not use the this method is called when a frame is wanted on an output For a it should directly call filter_frame on the corresponding output For a if there are queued frames already one of these frames should be pushed If the filter should request a frame on one of its repeatedly until at least one frame has been pushed Return or at least make progress towards producing a frame
Definition: filter_design.txt:265
WhisperContext::audio_buffer
float * audio_buffer
Definition: af_whisper.c:61
FF_COUNT2LAYOUT
#define FF_COUNT2LAYOUT(c)
Encode a channel count as a channel layout.
Definition: formats.h:102
ff_set_sample_formats_from_list2
int ff_set_sample_formats_from_list2(const AVFilterContext *ctx, AVFilterFormatsConfig **cfg_in, AVFilterFormatsConfig **cfg_out, const enum AVSampleFormat *fmts)
Definition: formats.c:1153
push_last_frame
static int push_last_frame(AVFilterLink *outlink)
Definition: af_whisper.c:357
status
ov_status_e status
Definition: dnn_backend_openvino.c:100
channel_layout.h
WhisperContext::ctx_vad
struct whisper_vad_context * ctx_vad
Definition: af_whisper.c:58
AV_OPT_TYPE_INT
@ AV_OPT_TYPE_INT
Underlying C type is int.
Definition: opt.h:259
avfilter.h
WhisperContext::ctx_wsp
struct whisper_context * ctx_wsp
Definition: af_whisper.c:57
AVFILTER_FLAG_METADATA_ONLY
#define AVFILTER_FLAG_METADATA_ONLY
The filter is a "metadata" filter - it does not modify the frame data in any way.
Definition: avfilter.h:183
WhisperContext::destination
char * destination
Definition: af_whisper.c:53
samples
Filter the word “frame” indicates either a video frame or a group of audio samples
Definition: filter_design.txt:8
AVFilterContext
An instance of a filter.
Definition: avfilter.h:274
AVIO_FLAG_DIRECT
#define AVIO_FLAG_DIRECT
Use direct mode.
Definition: avio.h:644
WhisperContext::next_pts
int64_t next_pts
Definition: af_whisper.c:68
av_strdup
char * av_strdup(const char *s)
Duplicate a string.
Definition: mem.c:272
FFFilter::p
AVFilter p
The public AVFilter.
Definition: filters.h:271
FLAGS
#define FLAGS
Definition: af_whisper.c:440
avutil.h
mem.h
audio.h
AV_OPT_TYPE_BOOL
@ AV_OPT_TYPE_BOOL
Underlying C type is int.
Definition: opt.h:327
avio_closep
int avio_closep(AVIOContext **s)
Close the resource accessed by the AVIOContext *s, free it and set the pointer pointing to it to NULL...
Definition: avio.c:650
av_freep
#define av_freep(p)
Definition: tableprint_vlc.h:35
av_dict_set
int av_dict_set(AVDictionary **pm, const char *key, const char *value, int flags)
Set the given entry in *pm, overwriting an existing entry.
Definition: dict.c:86
WhisperContext::vad_min_speech_duration
int64_t vad_min_speech_duration
Definition: af_whisper.c:49
WhisperContext::vad_params
struct whisper_vad_params vad_params
Definition: af_whisper.c:59
av_log
#define av_log(a,...)
Definition: tableprint_vlc.h:27
WhisperContext::queue
int64_t queue
Definition: af_whisper.c:52
avstring.h
AV_OPT_TYPE_STRING
@ AV_OPT_TYPE_STRING
Underlying C type is a uint8_t* that is either NULL or points to a C string allocated with the av_mal...
Definition: opt.h:276
input_data
static void input_data(MLPEncodeContext *ctx, MLPSubstream *s, uint8_t **const samples, int nb_samples)
Wrapper function for inputting data in two different bit-depths.
Definition: mlpenc.c:1219
AV_SAMPLE_FMT_FLT
@ AV_SAMPLE_FMT_FLT
float
Definition: samplefmt.h:60
duration
static int64_t duration
Definition: ffplay.c:329