FFmpeg
libspeexenc.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2009 Justin Ruggles
3  * Copyright (c) 2009 Xuggle Incorporated
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 /**
23  * @file
24  * libspeex Speex audio encoder
25  *
26  * Usage Guide
27  * This explains the values that need to be set prior to initialization in
28  * order to control various encoding parameters.
29  *
30  * Channels
31  * Speex only supports mono or stereo, so avctx->channels must be set to
32  * 1 or 2.
33  *
34  * Sample Rate / Encoding Mode
35  * Speex has 3 modes, each of which uses a specific sample rate.
36  * narrowband : 8 kHz
37  * wideband : 16 kHz
38  * ultra-wideband : 32 kHz
39  * avctx->sample_rate must be set to one of these 3 values. This will be
40  * used to set the encoding mode.
41  *
42  * Rate Control
43  * VBR mode is turned on by setting AV_CODEC_FLAG_QSCALE in avctx->flags.
44  * avctx->global_quality is used to set the encoding quality.
45  * For CBR mode, avctx->bit_rate can be used to set the constant bitrate.
46  * Alternatively, the 'cbr_quality' option can be set from 0 to 10 to set
47  * a constant bitrate based on quality.
48  * For ABR mode, set avctx->bit_rate and set the 'abr' option to 1.
49  * Approx. Bitrate Range:
50  * narrowband : 2400 - 25600 bps
51  * wideband : 4000 - 43200 bps
52  * ultra-wideband : 4400 - 45200 bps
53  *
54  * Complexity
55  * Encoding complexity is controlled by setting avctx->compression_level.
56  * The valid range is 0 to 10. A higher setting gives generally better
57  * quality at the expense of encoding speed. This does not affect the
58  * bit rate.
59  *
60  * Frames-per-Packet
61  * The encoder defaults to using 1 frame-per-packet. However, it is
62  * sometimes desirable to use multiple frames-per-packet to reduce the
63  * amount of container overhead. This can be done by setting the
64  * 'frames_per_packet' option to a value from 1 to 8.
65  *
66  *
67  * Optional features
68  * Speex encoder supports several optional features, which can be useful
69  * for some conditions.
70  *
71  * Voice Activity Detection
72  * When enabled, voice activity detection detects whether the audio
73  * being encoded is speech or silence/background noise. VAD is always
74  * implicitly activated when encoding in VBR, so the option is only useful
75  * in non-VBR operation. In this case, Speex detects non-speech periods and
76  * encodes them with just enough bits to reproduce the background noise.
77  *
78  * Discontinuous Transmission (DTX)
79  * DTX is an addition to VAD/VBR operation that makes it possible to stop
80  * transmitting completely when the background noise is stationary.
81  * In file-based operation only 5 bits are used for such frames.
82  */
83 
#include <speex/speex.h>
#include <speex/speex_header.h>
#include <speex/speex_stereo.h>

#include "libavutil/channel_layout.h"
#include "libavutil/common.h"
#include "libavutil/opt.h"
#include "avcodec.h"
#include "internal.h"
#include "audio_frame_queue.h"
94 
95 /* TODO: Think about converting abr, vad, dtx and such flags to a bit field */
96 typedef struct LibSpeexEncContext {
97  AVClass *class; ///< AVClass for private options
98  SpeexBits bits; ///< libspeex bitwriter context
99  SpeexHeader header; ///< libspeex header struct
100  void *enc_state; ///< libspeex encoder state
101  int frames_per_packet; ///< number of frames to encode in each packet
102  float vbr_quality; ///< VBR quality 0.0 to 10.0
103  int cbr_quality; ///< CBR quality 0 to 10
104  int abr; ///< flag to enable ABR
105  int vad; ///< flag to enable VAD
106  int dtx; ///< flag to enable DTX
107  int pkt_frame_count; ///< frame count for the current packet
108  AudioFrameQueue afq; ///< frame queue
110 
113 {
114  const char *mode_str = "unknown";
115 
116  av_log(avctx, AV_LOG_DEBUG, "channels: %d\n", avctx->channels);
117  switch (s->header.mode) {
118  case SPEEX_MODEID_NB: mode_str = "narrowband"; break;
119  case SPEEX_MODEID_WB: mode_str = "wideband"; break;
120  case SPEEX_MODEID_UWB: mode_str = "ultra-wideband"; break;
121  }
122  av_log(avctx, AV_LOG_DEBUG, "mode: %s\n", mode_str);
123  if (s->header.vbr) {
124  av_log(avctx, AV_LOG_DEBUG, "rate control: VBR\n");
125  av_log(avctx, AV_LOG_DEBUG, " quality: %f\n", s->vbr_quality);
126  } else if (s->abr) {
127  av_log(avctx, AV_LOG_DEBUG, "rate control: ABR\n");
128  av_log(avctx, AV_LOG_DEBUG, " bitrate: %"PRId64" bps\n", avctx->bit_rate);
129  } else {
130  av_log(avctx, AV_LOG_DEBUG, "rate control: CBR\n");
131  av_log(avctx, AV_LOG_DEBUG, " bitrate: %"PRId64" bps\n", avctx->bit_rate);
132  }
133  av_log(avctx, AV_LOG_DEBUG, "complexity: %d\n",
134  avctx->compression_level);
135  av_log(avctx, AV_LOG_DEBUG, "frame size: %d samples\n",
136  avctx->frame_size);
137  av_log(avctx, AV_LOG_DEBUG, "frames per packet: %d\n",
138  s->frames_per_packet);
139  av_log(avctx, AV_LOG_DEBUG, "packet size: %d\n",
140  avctx->frame_size * s->frames_per_packet);
141  av_log(avctx, AV_LOG_DEBUG, "voice activity detection: %d\n", s->vad);
142  av_log(avctx, AV_LOG_DEBUG, "discontinuous transmission: %d\n", s->dtx);
143 }
144 
146 {
147  LibSpeexEncContext *s = avctx->priv_data;
148  const SpeexMode *mode;
149  uint8_t *header_data;
150  int header_size;
151  int32_t complexity;
152 
153  /* channels */
154  if (avctx->channels < 1 || avctx->channels > 2) {
155  av_log(avctx, AV_LOG_ERROR, "Invalid channels (%d). Only stereo and "
156  "mono are supported\n", avctx->channels);
157  return AVERROR(EINVAL);
158  }
159 
160  /* sample rate and encoding mode */
161  switch (avctx->sample_rate) {
162  case 8000: mode = speex_lib_get_mode(SPEEX_MODEID_NB); break;
163  case 16000: mode = speex_lib_get_mode(SPEEX_MODEID_WB); break;
164  case 32000: mode = speex_lib_get_mode(SPEEX_MODEID_UWB); break;
165  default:
166  av_log(avctx, AV_LOG_ERROR, "Sample rate of %d Hz is not supported. "
167  "Resample to 8, 16, or 32 kHz.\n", avctx->sample_rate);
168  return AVERROR(EINVAL);
169  }
170 
171  /* initialize libspeex */
172  s->enc_state = speex_encoder_init(mode);
173  if (!s->enc_state) {
174  av_log(avctx, AV_LOG_ERROR, "Error initializing libspeex\n");
175  return -1;
176  }
177  speex_init_header(&s->header, avctx->sample_rate, avctx->channels, mode);
178 
179  /* rate control method and parameters */
180  if (avctx->flags & AV_CODEC_FLAG_QSCALE) {
181  /* VBR */
182  s->header.vbr = 1;
183  s->vad = 1; /* VAD is always implicitly activated for VBR */
184  speex_encoder_ctl(s->enc_state, SPEEX_SET_VBR, &s->header.vbr);
185  s->vbr_quality = av_clipf(avctx->global_quality / (float)FF_QP2LAMBDA,
186  0.0f, 10.0f);
187  speex_encoder_ctl(s->enc_state, SPEEX_SET_VBR_QUALITY, &s->vbr_quality);
188  } else {
189  s->header.bitrate = avctx->bit_rate;
190  if (avctx->bit_rate > 0) {
191  /* CBR or ABR by bitrate */
192  if (s->abr) {
193  speex_encoder_ctl(s->enc_state, SPEEX_SET_ABR,
194  &s->header.bitrate);
195  speex_encoder_ctl(s->enc_state, SPEEX_GET_ABR,
196  &s->header.bitrate);
197  } else {
198  speex_encoder_ctl(s->enc_state, SPEEX_SET_BITRATE,
199  &s->header.bitrate);
200  speex_encoder_ctl(s->enc_state, SPEEX_GET_BITRATE,
201  &s->header.bitrate);
202  }
203  } else {
204  /* CBR by quality */
205  speex_encoder_ctl(s->enc_state, SPEEX_SET_QUALITY,
206  &s->cbr_quality);
207  speex_encoder_ctl(s->enc_state, SPEEX_GET_BITRATE,
208  &s->header.bitrate);
209  }
210  /* stereo side information adds about 800 bps to the base bitrate */
211  /* TODO: this should be calculated exactly */
212  avctx->bit_rate = s->header.bitrate + (avctx->channels == 2 ? 800 : 0);
213  }
214 
215  /* VAD is activated with VBR or can be turned on by itself */
216  if (s->vad)
217  speex_encoder_ctl(s->enc_state, SPEEX_SET_VAD, &s->vad);
218 
219  /* Activating Discontinuous Transmission */
220  if (s->dtx) {
221  speex_encoder_ctl(s->enc_state, SPEEX_SET_DTX, &s->dtx);
222  if (!(s->abr || s->vad || s->header.vbr))
223  av_log(avctx, AV_LOG_WARNING, "DTX is not much of use without ABR, VAD or VBR\n");
224  }
225 
226  /* set encoding complexity */
228  complexity = av_clip(avctx->compression_level, 0, 10);
229  speex_encoder_ctl(s->enc_state, SPEEX_SET_COMPLEXITY, &complexity);
230  }
231  speex_encoder_ctl(s->enc_state, SPEEX_GET_COMPLEXITY, &complexity);
232  avctx->compression_level = complexity;
233 
234  /* set packet size */
235  avctx->frame_size = s->header.frame_size;
236  s->header.frames_per_packet = s->frames_per_packet;
237 
238  /* set encoding delay */
239  speex_encoder_ctl(s->enc_state, SPEEX_GET_LOOKAHEAD, &avctx->initial_padding);
240  ff_af_queue_init(avctx, &s->afq);
241 
242  /* create header packet bytes from header struct */
243  /* note: libspeex allocates the memory for header_data, which is freed
244  below with speex_header_free() */
245  header_data = speex_header_to_packet(&s->header, &header_size);
246 
247  /* allocate extradata */
248  avctx->extradata = av_malloc(header_size + AV_INPUT_BUFFER_PADDING_SIZE);
249  if (!avctx->extradata) {
250  speex_header_free(header_data);
251  speex_encoder_destroy(s->enc_state);
252  av_log(avctx, AV_LOG_ERROR, "memory allocation error\n");
253  return AVERROR(ENOMEM);
254  }
255 
256  /* copy header packet to extradata */
257  memcpy(avctx->extradata, header_data, header_size);
258  avctx->extradata_size = header_size;
259  speex_header_free(header_data);
260 
261  /* init libspeex bitwriter */
262  speex_bits_init(&s->bits);
263 
264  print_enc_params(avctx, s);
265  return 0;
266 }
267 
268 static int encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
269  const AVFrame *frame, int *got_packet_ptr)
270 {
271  LibSpeexEncContext *s = avctx->priv_data;
272  int16_t *samples = frame ? (int16_t *)frame->data[0] : NULL;
273  int ret;
274 
275  if (samples) {
276  /* encode Speex frame */
277  if (avctx->channels == 2)
278  speex_encode_stereo_int(samples, s->header.frame_size, &s->bits);
279  speex_encode_int(s->enc_state, samples, &s->bits);
280  s->pkt_frame_count++;
281  if ((ret = ff_af_queue_add(&s->afq, frame)) < 0)
282  return ret;
283  } else {
284  /* handle end-of-stream */
285  if (!s->pkt_frame_count)
286  return 0;
287  /* add extra terminator codes for unused frames in last packet */
288  while (s->pkt_frame_count < s->frames_per_packet) {
289  speex_bits_pack(&s->bits, 15, 5);
290  s->pkt_frame_count++;
291  }
292  }
293 
294  /* write output if all frames for the packet have been encoded */
295  if (s->pkt_frame_count == s->frames_per_packet) {
296  s->pkt_frame_count = 0;
297  if ((ret = ff_alloc_packet2(avctx, avpkt, speex_bits_nbytes(&s->bits), 0)) < 0)
298  return ret;
299  ret = speex_bits_write(&s->bits, avpkt->data, avpkt->size);
300  speex_bits_reset(&s->bits);
301 
302  /* Get the next frame pts/duration */
304  &avpkt->pts, &avpkt->duration);
305 
306  avpkt->size = ret;
307  *got_packet_ptr = 1;
308  return 0;
309  }
310  return 0;
311 }
312 
314 {
315  LibSpeexEncContext *s = avctx->priv_data;
316 
317  speex_bits_destroy(&s->bits);
318  speex_encoder_destroy(s->enc_state);
319 
320  ff_af_queue_close(&s->afq);
321  av_freep(&avctx->extradata);
322 
323  return 0;
324 }
325 
326 #define OFFSET(x) offsetof(LibSpeexEncContext, x)
327 #define AE AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
328 static const AVOption options[] = {
329  { "abr", "Use average bit rate", OFFSET(abr), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, AE },
330  { "cbr_quality", "Set quality value (0 to 10) for CBR", OFFSET(cbr_quality), AV_OPT_TYPE_INT, { .i64 = 8 }, 0, 10, AE },
331  { "frames_per_packet", "Number of frames to encode in each packet", OFFSET(frames_per_packet), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, 8, AE },
332  { "vad", "Voice Activity Detection", OFFSET(vad), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, AE },
333  { "dtx", "Discontinuous Transmission", OFFSET(dtx), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, AE },
334  { NULL },
335 };
336 
337 static const AVClass speex_class = {
338  .class_name = "libspeex",
339  .item_name = av_default_item_name,
340  .option = options,
341  .version = LIBAVUTIL_VERSION_INT,
342 };
343 
344 static const AVCodecDefault defaults[] = {
345  { "b", "0" },
346  { "compression_level", "3" },
347  { NULL },
348 };
349 
351  .name = "libspeex",
352  .long_name = NULL_IF_CONFIG_SMALL("libspeex Speex"),
353  .type = AVMEDIA_TYPE_AUDIO,
354  .id = AV_CODEC_ID_SPEEX,
355  .priv_data_size = sizeof(LibSpeexEncContext),
356  .init = encode_init,
357  .encode2 = encode_frame,
358  .close = encode_close,
359  .capabilities = AV_CODEC_CAP_DELAY,
360  .sample_fmts = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
362  .channel_layouts = (const uint64_t[]){ AV_CH_LAYOUT_MONO,
364  0 },
365  .supported_samplerates = (const int[]){ 8000, 16000, 32000, 0 },
366  .priv_class = &speex_class,
367  .defaults = defaults,
368  .wrapper_name = "libspeex",
369 };
int pkt_frame_count
frame count for the current packet
Definition: libspeexenc.c:107
void ff_af_queue_remove(AudioFrameQueue *afq, int nb_samples, int64_t *pts, int64_t *duration)
Remove frame(s) from the queue.
#define NULL
Definition: coverity.c:32
#define FF_COMPRESSION_DEFAULT
Definition: avcodec.h:1643
static av_cold int encode_init(AVCodecContext *avctx)
Definition: libspeexenc.c:145
This structure describes decoded (raw) audio or video data.
Definition: frame.h:295
AVOption.
Definition: opt.h:246
static int encode_frame(AVCodecContext *avctx, AVPacket *avpkt, const AVFrame *frame, int *got_packet_ptr)
Definition: libspeexenc.c:268
#define AV_LOG_WARNING
Something somehow does not look correct.
Definition: log.h:182
int64_t bit_rate
the average bitrate
Definition: avcodec.h:1620
#define LIBAVUTIL_VERSION_INT
Definition: version.h:85
static av_cold int init(AVCodecContext *avctx)
Definition: avrndec.c:35
int size
Definition: avcodec.h:1483
const char * av_default_item_name(void *ptr)
Return the context name.
Definition: log.c:191
int cbr_quality
CBR quality 0 to 10.
Definition: libspeexenc.c:103
int vad
flag to enable VAD
Definition: libspeexenc.c:105
SpeexBits bits
libspeex bitwriter context
Definition: libspeexenc.c:98
static av_cold int encode_close(AVCodecContext *avctx)
Definition: libspeexenc.c:313
#define AV_CH_LAYOUT_STEREO
int dtx
flag to enable DTX
Definition: libspeexenc.c:106
AVCodec.
Definition: avcodec.h:3494
const char * class_name
The name of the class; usually it is the same name as the context structure type to which the AVClass...
Definition: log.h:72
#define AV_CODEC_CAP_DELAY
Encoder or decoder requires flushing with NULL input at the end in order to give the complete and cor...
Definition: avcodec.h:1011
int ff_alloc_packet2(AVCodecContext *avctx, AVPacket *avpkt, int64_t size, int64_t min_size)
Check AVPacket size and/or allocate data.
Definition: encode.c:32
uint8_t
#define av_cold
Definition: attributes.h:82
#define av_malloc(s)
AVOptions.
AudioFrameQueue afq
frame queue
Definition: libspeexenc.c:108
av_cold void ff_af_queue_init(AVCodecContext *avctx, AudioFrameQueue *afq)
Initialize AudioFrameQueue.
int64_t duration
Duration of this packet in AVStream->time_base units, 0 if unknown.
Definition: avcodec.h:1500
uint8_t * extradata
some codecs need / can use extradata like Huffman tables.
Definition: avcodec.h:1671
AVCodec ff_libspeex_encoder
Definition: libspeexenc.c:350
uint8_t * data
Definition: avcodec.h:1482
#define av_log(a,...)
int frames_per_packet
number of frames to encode in each packet
Definition: libspeexenc.c:101
#define AV_LOG_ERROR
Something went wrong and cannot losslessly be recovered.
Definition: log.h:176
static const AVCodecDefault defaults[]
Definition: libspeexenc.c:344
#define NULL_IF_CONFIG_SMALL(x)
Return NULL if CONFIG_SMALL is true, otherwise the argument without modification. ...
Definition: internal.h:186
int initial_padding
Audio only.
Definition: avcodec.h:3101
#define AV_LOG_DEBUG
Stuff which is only useful for libav* developers.
Definition: log.h:197
int flags
AV_CODEC_FLAG_*.
Definition: avcodec.h:1650
const char * name
Name of the codec implementation.
Definition: avcodec.h:3501
#define AE
Definition: libspeexenc.c:327
int ff_af_queue_add(AudioFrameQueue *afq, const AVFrame *f)
Add a frame to the queue.
audio channel layout utility functions
#define AV_CODEC_FLAG_QSCALE
Use fixed qscale.
Definition: avcodec.h:855
static const AVClass speex_class
Definition: libspeexenc.c:337
int32_t
these buffered frames must be flushed immediately if a new input produces new the filter must not call request_frame to get more It must just process the frame or queue it The task of requesting more frames is left to the filter s request_frame method or the application If a filter has several the filter must be ready for frames arriving randomly on any input any filter with several inputs will most likely require some kind of queuing mechanism It is perfectly acceptable to have a limited queue and to drop frames when the inputs are too unbalanced request_frame For filters that do not use the this method is called when a frame is wanted on an output For a it should directly call filter_frame on the corresponding output For a if there are queued frames already one of these frames should be pushed If the filter should request a frame on one of its repeatedly until at least one frame has been pushed Return or at least make progress towards producing a frame
#define s(width, name)
Definition: cbs_vp9.c:257
if(ret)
int frame_size
Number of samples per channel in an audio frame.
Definition: avcodec.h:2250
void * enc_state
libspeex encoder state
Definition: libspeexenc.c:100
Libavcodec external API header.
AVSampleFormat
Audio sample formats.
Definition: samplefmt.h:58
#define OFFSET(x)
Definition: libspeexenc.c:326
int compression_level
Definition: avcodec.h:1642
int sample_rate
samples per second
Definition: avcodec.h:2230
main external API structure.
Definition: avcodec.h:1570
int extradata_size
Definition: avcodec.h:1672
Describe the class of an AVClass context structure.
Definition: log.h:67
int global_quality
Global quality for codecs which cannot change it per frame.
Definition: avcodec.h:1636
uint8_t * data[AV_NUM_DATA_POINTERS]
pointer to the picture/channel planes.
Definition: frame.h:309
common internal api header.
common internal and external API header
static av_cold void print_enc_params(AVCodecContext *avctx, LibSpeexEncContext *s)
Definition: libspeexenc.c:111
signed 16 bits
Definition: samplefmt.h:61
SpeexHeader header
libspeex header struct
Definition: libspeexenc.c:99
static const AVOption options[]
Definition: libspeexenc.c:328
#define AV_INPUT_BUFFER_PADDING_SIZE
Required number of additionally allocated bytes at the end of the input bitstream for decoding...
Definition: avcodec.h:795
void * priv_data
Definition: avcodec.h:1597
int channels
number of audio channels
Definition: avcodec.h:2231
int abr
flag to enable ABR
Definition: libspeexenc.c:104
#define FF_QP2LAMBDA
factor to convert from H.263 QP to lambda
Definition: avutil.h:227
void ff_af_queue_close(AudioFrameQueue *afq)
Close AudioFrameQueue.
static enum AVSampleFormat sample_fmts[]
Definition: adpcmenc.c:701
Filter the word “frame” indicates either a video frame or a group of audio samples
#define av_freep(p)
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later.That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another.Frame references ownership and permissions
float vbr_quality
VBR quality 0.0 to 10.0.
Definition: libspeexenc.c:102
#define AV_CH_LAYOUT_MONO
This structure stores compressed data.
Definition: avcodec.h:1459
mode
Use these values in ebur128_init (or&#39;ed).
Definition: ebur128.h:83
int64_t pts
Presentation timestamp in AVStream->time_base units; the time at which the decompressed packet will b...
Definition: avcodec.h:1475