FFmpeg
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
wmavoice.c
Go to the documentation of this file.
1 /*
2  * Windows Media Audio Voice decoder.
3  * Copyright (c) 2009 Ronald S. Bultje
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 /**
23  * @file
24  * @brief Windows Media Audio Voice compatible decoder
25  * @author Ronald S. Bultje <rsbultje@gmail.com>
26  */
27 
28 #include <math.h>
29 
31 #include "libavutil/float_dsp.h"
32 #include "libavutil/mem.h"
33 #include "avcodec.h"
34 #include "internal.h"
35 #include "get_bits.h"
36 #include "put_bits.h"
37 #include "wmavoice_data.h"
38 #include "celp_filters.h"
39 #include "acelp_vectors.h"
40 #include "acelp_filters.h"
41 #include "lsp.h"
42 #include "dct.h"
43 #include "rdft.h"
44 #include "sinewin.h"
45 
46 #define MAX_BLOCKS 8 ///< maximum number of blocks per frame
47 #define MAX_LSPS 16 ///< maximum filter order (number of LSPs)
48 #define MAX_LSPS_ALIGN16 16 ///< same as #MAX_LSPS; needs to be a multiple
49  ///< of 16 for ASM input buffer alignment
50 #define MAX_FRAMES 3 ///< maximum number of frames per superframe
51 #define MAX_FRAMESIZE 160 ///< maximum number of samples per frame
52 #define MAX_SIGNAL_HISTORY 416 ///< maximum excitation signal history, in samples
53 #define MAX_SFRAMESIZE (MAX_FRAMESIZE * MAX_FRAMES)
54  ///< maximum number of samples per superframe
55 #define SFRAME_CACHE_MAXSIZE 256 ///< maximum cache size for frame data that
56  ///< was split over two packets
57 #define VLC_NBITS 6 ///< number of bits to read per VLC iteration
58 
59 /**
60  * Frame type VLC coding.
61  */
63 
64 /**
65  * Adaptive codebook (ACB) types, as stored in frame_type_desc.acb_type.
66  */
67 enum {
68  ACB_TYPE_NONE = 0, ///< no adaptive codebook (only hardcoded fixed)
69  ACB_TYPE_ASYMMETRIC = 1, ///< adaptive codebook with per-frame pitch, which
70  ///< we interpolate to get a per-sample pitch.
71  ///< The signal is generated using an asymmetric
72  ///< sinc window function
73  ///< @note see #wmavoice_ipol1_coeffs
74  ACB_TYPE_HAMMING = 2 ///< per-block pitch with signal generation using
75  ///< a Hamming sinc window function
76  ///< @note see #wmavoice_ipol2_coeffs
77 };
78 
79 /**
80  * Fixed codebook (FCB) types, as stored in frame_type_desc.fcb_type.
81  */
82 enum {
83  FCB_TYPE_SILENCE = 0, ///< comfort noise during silence
84  ///< generated from a hardcoded (fixed) codebook
85  ///< with per-frame (low) gain values
86  FCB_TYPE_HARDCODED = 1, ///< hardcoded (fixed) codebook with per-block
87  ///< gain values
88  FCB_TYPE_AW_PULSES = 2, ///< pitch-adaptive window (AW) pulse signals,
89  ///< used in particular for low-bitrate streams
90  FCB_TYPE_EXC_PULSES = 3, ///< innovation (fixed) codebook pulse sets in
91  ///< combinations of either single pulses or
92  ///< pulse pairs
93 };
94 
95 /**
96  * Description of frame types.
 *
 * The table holds one entry for each of the 17 frame types; each entry
 * gives the block layout, the codebook types and the size of the block
 * data for that frame type.
 * NOTE(review): presumably indexed by the frame type number (0-16) that
 * decode_vbmtree() stores in the VBM tree - confirm against the frame
 * parsing code.
97  */
98 static const struct frame_type_desc {
99  uint8_t n_blocks; ///< amount of blocks per frame (each block
100  ///< contains 160/#n_blocks samples)
101  uint8_t log_n_blocks; ///< log2(#n_blocks)
102  uint8_t acb_type; ///< Adaptive codebook type (ACB_TYPE_*)
103  uint8_t fcb_type; ///< Fixed codebook type (FCB_TYPE_*)
104  uint8_t dbl_pulses; ///< how many pulse vectors have pulse pairs
105  ///< (rather than just one single pulse)
106  ///< only if #fcb_type == #FCB_TYPE_EXC_PULSES
107  uint16_t frame_size; ///< the amount of bits that make up the block
108  ///< data (per frame)
109 } frame_descs[17] = {
110  { 1, 0, ACB_TYPE_NONE, FCB_TYPE_SILENCE, 0, 0 },
111  { 2, 1, ACB_TYPE_NONE, FCB_TYPE_HARDCODED, 0, 28 },
112  { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_AW_PULSES, 0, 46 },
113  { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2, 80 },
114  { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 104 },
115  { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 0, 108 },
116  { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2, 132 },
117  { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 168 },
118  { 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0, 64 },
119  { 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2, 80 },
120  { 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5, 104 },
121  { 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0, 108 },
122  { 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2, 132 },
123  { 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5, 168 },
124  { 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0, 176 },
125  { 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2, 208 },
126  { 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5, 256 }
127 };
128 
129 /**
130  * WMA Voice decoding context.
131  */
132 typedef struct WMAVoiceContext {
133  /**
134  * @name Global values specified in the stream header / extradata or used all over.
135  * @{
136  */
137  GetBitContext gb; ///< packet bitreader. During decoder init,
138  ///< it contains the extradata from the
139  ///< demuxer. During decoding, it contains
140  ///< packet data.
141  int8_t vbm_tree[25]; ///< converts VLC codes to frame type
142 
143  int spillover_bitsize; ///< number of bits used to specify
144  ///< #spillover_nbits in the packet header
145  ///< = ceil(log2(ctx->block_align << 3))
146  int history_nsamples; ///< number of samples in history for signal
147  ///< prediction (through ACB)
148 
149  /* postfilter specific values */
150  int do_apf; ///< whether to apply the averaged
151  ///< projection filter (APF)
152  int denoise_strength; ///< strength of denoising in Wiener filter
153  ///< [0-11]
154  int denoise_tilt_corr; ///< Whether to apply tilt correction to the
155  ///< Wiener filter coefficients (postfilter)
156  int dc_level; ///< Predicted amount of DC noise, based
157  ///< on which a DC removal filter is used
158 
159  int lsps; ///< number of LSPs per frame [10 or 16]
160  int lsp_q_mode; ///< defines quantizer defaults [0, 1]
161  int lsp_def_mode; ///< defines different sets of LSP defaults
162  ///< [0, 1]
163  int frame_lsp_bitsize; ///< size (in bits) of LSPs, when encoded
164  ///< per-frame (independent coding)
165  int sframe_lsp_bitsize; ///< size (in bits) of LSPs, when encoded
166  ///< per superframe (residual coding)
167 
168  int min_pitch_val; ///< base value for pitch parsing code
169  int max_pitch_val; ///< max value + 1 for pitch parsing
170  int pitch_nbits; ///< number of bits used to specify the
171  ///< pitch value in the frame header
172  int block_pitch_nbits; ///< number of bits used to specify the
173  ///< first block's pitch value
174  int block_pitch_range; ///< range of the block pitch
175  int block_delta_pitch_nbits; ///< number of bits used to specify the
176  ///< delta pitch between this and the last
177  ///< block's pitch value, used in all but
178  ///< first block
179  int block_delta_pitch_hrange; ///< 1/2 range of the delta (full range is
180  ///< from -this to +this-1)
181  uint16_t block_conv_table[4]; ///< boundaries for block pitch unit/scale
182  ///< conversion
183 
184  /**
185  * @}
186  *
187  * @name Packet values specified in the packet header or related to a packet.
188  *
189  * A packet is considered to be a single unit of data provided to this
190  * decoder by the demuxer.
191  * @{
192  */
193  int spillover_nbits; ///< number of bits of the previous packet's
194  ///< last superframe preceding this
195  ///< packet's first full superframe (useful
196  ///< for re-synchronization also)
197  int has_residual_lsps; ///< if set, superframes contain one set of
198  ///< LSPs that cover all frames, encoded as
199  ///< independent and residual LSPs; if not
200  ///< set, each frame contains its own, fully
201  ///< independent, LSPs
202  int skip_bits_next; ///< number of bits to skip at the next call
203  ///< to #wmavoice_decode_packet() (since
204  ///< they're part of the previous superframe)
205 
207  ///< cache for superframe data split over
208  ///< multiple packets
209  int sframe_cache_size; ///< set to >0 if we have data from an
210  ///< (incomplete) superframe from a previous
211  ///< packet that spilled over in the current
212  ///< packet; specifies the amount of bits in
213  ///< #sframe_cache
214  PutBitContext pb; ///< bitstream writer for #sframe_cache
215 
216  /**
217  * @}
218  *
219  * @name Frame and superframe values
220  * Superframe and frame data - these can change from frame to frame,
221  * although some of them do in that case serve as a cache / history for
222  * the next frame or superframe.
223  * @{
224  */
225  double prev_lsps[MAX_LSPS]; ///< LSPs of the last frame of the previous
226  ///< superframe
227  int last_pitch_val; ///< pitch value of the previous frame
228  int last_acb_type; ///< frame type [0-2] of the previous frame
229  int pitch_diff_sh16; ///< ((cur_pitch_val - #last_pitch_val)
230  ///< << 16) / #MAX_FRAMESIZE
231  float silence_gain; ///< set for use in blocks if #ACB_TYPE_NONE
232 
233  int aw_idx_is_ext; ///< whether the AW index was encoded in
234  ///< 8 bits (instead of 6)
235  int aw_pulse_range; ///< the range over which #aw_pulse_set1()
236  ///< can apply the pulse, relative to the
237  ///< value in aw_first_pulse_off. The exact
238  ///< position of the first AW-pulse is within
239  ///< [pulse_off, pulse_off + this], and
240  ///< depends on bitstream values; [16 or 24]
241  int aw_n_pulses[2]; ///< number of AW-pulses in each block; note
242  ///< that this number can be negative (in
243  ///< which case it basically means "zero")
244  int aw_first_pulse_off[2]; ///< index of first sample to which to
245  ///< apply AW-pulses, or -0xff if unset
246  int aw_next_pulse_off_cache; ///< the position (relative to start of the
247  ///< second block) at which pulses should
248  ///< start to be positioned, serves as a
249  ///< cache for pitch-adaptive window pulses
250  ///< between blocks
251 
252  int frame_cntr; ///< current frame index [0 - 0xFFFE]; is
253  ///< only used for comfort noise in #pRNG()
254  int nb_superframes; ///< number of superframes in current packet
255  float gain_pred_err[6]; ///< cache for gain prediction
257  ///< cache of the signal of previous
258  ///< superframes, used as a history for
259  ///< signal generation
260  float synth_history[MAX_LSPS]; ///< see #excitation_history
261  /**
262  * @}
263  *
264  * @name Postfilter values
265  *
266  * Variables used for postfilter implementation, mostly history for
267  * smoothing and so on, and context variables for FFT/iFFT.
268  * @{
269  */
270  RDFTContext rdft, irdft; ///< contexts for FFT-calculation in the
271  ///< postfilter (for denoise filter)
272  DCTContext dct, dst; ///< contexts for phase shift (in Hilbert
273  ///< transform, part of postfilter)
274  float sin[511], cos[511]; ///< 8-bit cosine/sine windows over [-pi,pi]
275  ///< range
276  float postfilter_agc; ///< gain control memory, used in
277  ///< #adaptive_gain_control()
278  float dcf_mem[2]; ///< DC filter history
280  ///< zero filter output (i.e. excitation)
281  ///< by postfilter
283  int denoise_filter_cache_size; ///< samples in #denoise_filter_cache
284  DECLARE_ALIGNED(32, float, tilted_lpcs_pf)[0x80];
285  ///< aligned buffer for LPC tilting
287  ///< aligned buffer for denoise coefficients
289  ///< aligned buffer for postfilter speech
290  ///< synthesis
291  /**
292  * @}
293  */
295 
296 /**
297  * Set up the variable bit mode (VBM) tree from container extradata.
298  * @param gb bit I/O context.
299  * The bit context (s->gb) should be loaded with byte 23-46 of the
300  * container extradata (i.e. the ones containing the VBM tree).
301  * @param vbm_tree pointer to array to which the decoded VBM tree will be
302  * written.
303  * @return 0 on success, <0 on error.
304  */
305 static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
306 {
307  int cntr[8] = { 0 }, n, res;
308 
309  memset(vbm_tree, 0xff, sizeof(vbm_tree[0]) * 25);
310  for (n = 0; n < 17; n++) {
311  res = get_bits(gb, 3);
312  if (cntr[res] > 3) // should be >= 3 + (res == 7))
313  return -1;
314  vbm_tree[res * 3 + cntr[res]++] = n;
315  }
316  return 0;
317 }
318 
320 {
321  static const uint8_t bits[] = {
322  2, 2, 2, 4, 4, 4,
323  6, 6, 6, 8, 8, 8,
324  10, 10, 10, 12, 12, 12,
325  14, 14, 14, 14
326  };
327  static const uint16_t codes[] = {
328  0x0000, 0x0001, 0x0002, // 00/01/10
329  0x000c, 0x000d, 0x000e, // 11+00/01/10
330  0x003c, 0x003d, 0x003e, // 1111+00/01/10
331  0x00fc, 0x00fd, 0x00fe, // 111111+00/01/10
332  0x03fc, 0x03fd, 0x03fe, // 11111111+00/01/10
333  0x0ffc, 0x0ffd, 0x0ffe, // 1111111111+00/01/10
334  0x3ffc, 0x3ffd, 0x3ffe, 0x3fff // 111111111111+xx
335  };
336 
337  INIT_VLC_STATIC(&frame_type_vlc, VLC_NBITS, sizeof(bits),
338  bits, 1, 1, codes, 2, 2, 132);
339 }
340 
342 {
343  WMAVoiceContext *s = ctx->priv_data;
344  int n;
345 
346  s->postfilter_agc = 0;
347  s->sframe_cache_size = 0;
348  s->skip_bits_next = 0;
349  for (n = 0; n < s->lsps; n++)
350  s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
351  memset(s->excitation_history, 0,
352  sizeof(*s->excitation_history) * MAX_SIGNAL_HISTORY);
353  memset(s->synth_history, 0,
354  sizeof(*s->synth_history) * MAX_LSPS);
355  memset(s->gain_pred_err, 0,
356  sizeof(s->gain_pred_err));
357 
358  if (s->do_apf) {
359  memset(&s->synth_filter_out_buf[MAX_LSPS_ALIGN16 - s->lsps], 0,
360  sizeof(*s->synth_filter_out_buf) * s->lsps);
361  memset(s->dcf_mem, 0,
362  sizeof(*s->dcf_mem) * 2);
363  memset(s->zero_exc_pf, 0,
364  sizeof(*s->zero_exc_pf) * s->history_nsamples);
365  memset(s->denoise_filter_cache, 0, sizeof(s->denoise_filter_cache));
366  }
367 }
368 
369 /**
370  * Set up decoder with parameters from demuxer (extradata etc.).
371  */
373 {
374  int n, flags, pitch_range, lsp16_flag;
375  WMAVoiceContext *s = ctx->priv_data;
376 
377  /**
378  * Extradata layout:
379  * - byte 0-18: WMAPro-in-WMAVoice extradata (see wmaprodec.c),
380  * - byte 19-22: flags field (annoyingly in LE; see below for known
381  * values),
382  * - byte 23-46: variable bitmode tree (really just 17 * 3 bits,
383  * rest is 0).
384  */
385  if (ctx->extradata_size != 46) {
386  av_log(ctx, AV_LOG_ERROR,
387  "Invalid extradata size %d (should be 46)\n",
388  ctx->extradata_size);
389  return AVERROR_INVALIDDATA;
390  }
391  if (ctx->block_align <= 0) {
392  av_log(ctx, AV_LOG_ERROR, "Invalid block alignment %d.\n", ctx->block_align);
393  return AVERROR_INVALIDDATA;
394  }
395 
396  flags = AV_RL32(ctx->extradata + 18);
397  s->spillover_bitsize = 3 + av_ceil_log2(ctx->block_align);
398  s->do_apf = flags & 0x1;
399  if (s->do_apf) {
400  ff_rdft_init(&s->rdft, 7, DFT_R2C);
401  ff_rdft_init(&s->irdft, 7, IDFT_C2R);
402  ff_dct_init(&s->dct, 6, DCT_I);
403  ff_dct_init(&s->dst, 6, DST_I);
404 
405  ff_sine_window_init(s->cos, 256);
406  memcpy(&s->sin[255], s->cos, 256 * sizeof(s->cos[0]));
407  for (n = 0; n < 255; n++) {
408  s->sin[n] = -s->sin[510 - n];
409  s->cos[510 - n] = s->cos[n];
410  }
411  }
412  s->denoise_strength = (flags >> 2) & 0xF;
413  if (s->denoise_strength >= 12) {
414  av_log(ctx, AV_LOG_ERROR,
415  "Invalid denoise filter strength %d (max=11)\n",
416  s->denoise_strength);
417  return AVERROR_INVALIDDATA;
418  }
419  s->denoise_tilt_corr = !!(flags & 0x40);
420  s->dc_level = (flags >> 7) & 0xF;
421  s->lsp_q_mode = !!(flags & 0x2000);
422  s->lsp_def_mode = !!(flags & 0x4000);
423  lsp16_flag = flags & 0x1000;
424  if (lsp16_flag) {
425  s->lsps = 16;
426  s->frame_lsp_bitsize = 34;
427  s->sframe_lsp_bitsize = 60;
428  } else {
429  s->lsps = 10;
430  s->frame_lsp_bitsize = 24;
431  s->sframe_lsp_bitsize = 48;
432  }
433  for (n = 0; n < s->lsps; n++)
434  s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
435 
436  init_get_bits(&s->gb, ctx->extradata + 22, (ctx->extradata_size - 22) << 3);
437  if (decode_vbmtree(&s->gb, s->vbm_tree) < 0) {
438  av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree; broken extradata?\n");
439  return AVERROR_INVALIDDATA;
440  }
441 
442  s->min_pitch_val = ((ctx->sample_rate << 8) / 400 + 50) >> 8;
443  s->max_pitch_val = ((ctx->sample_rate << 8) * 37 / 2000 + 50) >> 8;
444  pitch_range = s->max_pitch_val - s->min_pitch_val;
445  if (pitch_range <= 0) {
446  av_log(ctx, AV_LOG_ERROR, "Invalid pitch range; broken extradata?\n");
447  return AVERROR_INVALIDDATA;
448  }
449  s->pitch_nbits = av_ceil_log2(pitch_range);
450  s->last_pitch_val = 40;
452  s->history_nsamples = s->max_pitch_val + 8;
453 
455  int min_sr = ((((1 << 8) - 50) * 400) + 0xFF) >> 8,
456  max_sr = ((((MAX_SIGNAL_HISTORY - 8) << 8) + 205) * 2000 / 37) >> 8;
457 
458  av_log(ctx, AV_LOG_ERROR,
459  "Unsupported samplerate %d (min=%d, max=%d)\n",
460  ctx->sample_rate, min_sr, max_sr); // 322-22097 Hz
461 
462  return AVERROR(ENOSYS);
463  }
464 
465  s->block_conv_table[0] = s->min_pitch_val;
466  s->block_conv_table[1] = (pitch_range * 25) >> 6;
467  s->block_conv_table[2] = (pitch_range * 44) >> 6;
468  s->block_conv_table[3] = s->max_pitch_val - 1;
469  s->block_delta_pitch_hrange = (pitch_range >> 3) & ~0xF;
470  if (s->block_delta_pitch_hrange <= 0) {
471  av_log(ctx, AV_LOG_ERROR, "Invalid delta pitch hrange; broken extradata?\n");
472  return AVERROR_INVALIDDATA;
473  }
474  s->block_delta_pitch_nbits = 1 + av_ceil_log2(s->block_delta_pitch_hrange);
476  s->block_conv_table[3] + 1 +
477  2 * (s->block_conv_table[1] - 2 * s->min_pitch_val);
478  s->block_pitch_nbits = av_ceil_log2(s->block_pitch_range);
479 
480  ctx->channels = 1;
483 
484  return 0;
485 }
486 
487 /**
488  * @name Postfilter functions
489  * Postfilter functions (gain control, wiener denoise filter, DC filter,
490  * kalman smoothening, plus surrounding code to wrap it)
491  * @{
492  */
/**
 * Adaptive gain control (as used in postfilter).
 *
 * Identical to #ff_adaptive_gain_control() in acelp_vectors.c, except
 * that the energy here is calculated using sum(abs(...)), whereas the
 * other codecs (e.g. AMR-NB, SIPRO) use sqrt(dotproduct(...)).
 *
 * The per-sample gain is a first-order recursion,
 * gain = alpha * gain + (1 - alpha) * E(speech_synth) / E(in),
 * and its final value is written back to @p gain_mem for the next call.
 * If the energy of @p in is zero, the second term is taken as zero.
 *
 * @param out output buffer for filtered samples
 * @param in input buffer containing the samples as they are after the
 *           postfilter steps so far
 * @param speech_synth input buffer containing speech synth before postfilter
 * @param size input buffer size
 * @param alpha exponential filter factor
 * @param gain_mem pointer to filter memory (single float)
 */
static void adaptive_gain_control(float *out, const float *in,
                                  const float *speech_synth,
                                  int size, float alpha, float *gain_mem)
{
    float synth_sum = 0.0, filtered_sum = 0.0;
    float target_gain;
    float gain = *gain_mem;
    int k;

    /* "energy" of both signals: sum of absolute sample values */
    for (k = 0; k < size; k++) {
        synth_sum    += fabsf(speech_synth[k]);
        filtered_sum += fabsf(in[k]);
    }

    /* avoid dividing by zero for an all-zero filtered signal */
    if (filtered_sum == 0.0)
        target_gain = 0.0;
    else
        target_gain = (1.0 - alpha) * synth_sum / filtered_sum;

    /* smooth the gain sample by sample while applying it */
    for (k = 0; k < size; k++) {
        gain   = alpha * gain + target_gain;
        out[k] = in[k] * gain;
    }

    *gain_mem = gain;
}
530 
531 /**
532  * Kalman smoothing function.
533  *
534  * This function looks back pitch +/- 3 samples back into history to find
535  * the best fitting curve (that one giving the optimal gain of the two
536  * signals, i.e. the highest dot product between the two), and then
537  * uses that signal history to smoothen the output of the speech synthesis
538  * filter.
539  *
540  * @param s WMA Voice decoding context
541  * @param pitch pitch of the speech signal
542  * @param in input speech signal
543  * @param out output pointer for smoothened signal
544  * @param size input/output buffer size
545  *
546  * @returns -1 if no smoothening took place, e.g. because no optimal
547  * fit could be found, or 0 on success.
548  */
549 static int kalman_smoothen(WMAVoiceContext *s, int pitch,
550  const float *in, float *out, int size)
551 {
552  int n;
553  float optimal_gain = 0, dot;
554  const float *ptr = &in[-FFMAX(s->min_pitch_val, pitch - 3)],
555  *end = &in[-FFMIN(s->max_pitch_val, pitch + 3)],
556  *best_hist_ptr = NULL;
557 
558  /* find best fitting point in history */
559  do {
560  dot = avpriv_scalarproduct_float_c(in, ptr, size);
561  if (dot > optimal_gain) {
562  optimal_gain = dot;
563  best_hist_ptr = ptr;
564  }
565  } while (--ptr >= end);
566 
567  if (optimal_gain <= 0)
568  return -1;
569  dot = avpriv_scalarproduct_float_c(best_hist_ptr, best_hist_ptr, size);
570  if (dot <= 0) // would be 1.0
571  return -1;
572 
573  if (optimal_gain <= dot) {
574  dot = dot / (dot + 0.6 * optimal_gain); // 0.625-1.000
575  } else
576  dot = 0.625;
577 
578  /* actual smoothing */
579  for (n = 0; n < size; n++)
580  out[n] = best_hist_ptr[n] + dot * (in[n] - best_hist_ptr[n]);
581 
582  return 0;
583 }
584 
/**
 * Get the tilt factor of a formant filter from its transfer function.
 *
 * The result is the ratio rh1 / rh0, where rh0 is 1 plus the dot product of
 * the coefficients with themselves, and rh1 is the first coefficient plus
 * the dot product of the coefficients with themselves shifted by one.
 *
 * @see #tilt_factor() in amrnbdec.c, which does essentially the same,
 *      but somehow (??) it does a speech synthesis filter in the
 *      middle, which is missing here
 *
 * @param lpcs LPC coefficients
 * @param n_lpcs Size of LPC buffer
 * @returns the tilt factor
 */
static float tilt_factor(const float *lpcs, int n_lpcs)
{
    const float energy = 1.0 + avpriv_scalarproduct_float_c(lpcs, lpcs, n_lpcs);
    const float lag1   = lpcs[0] +
                         avpriv_scalarproduct_float_c(lpcs, &lpcs[1],
                                                      n_lpcs - 1);

    return lag1 / energy;
}
604 
605 /**
606  * Derive denoise filter coefficients (in real domain) from the LPCs.
 *
 * @param s         WMA Voice decoding context; supplies the RDFT, inverse
 *                  RDFT, DCT and DST contexts, the sine/cosine tables and
 *                  the denoise_tilt_corr setting
 * @param lpcs      in/out: transformed in place by the RDFT and then reused
 *                  as scratch space; elements [0, 64] are written
 * @param fcb_type  fixed codebook type (FCB_TYPE_*) of the frame; here it
 *                  only selects the gain multiplier
 * @param coeffs    output: filter coefficients; entries from remainder up
 *                  to 127 are zeroed, the first remainder entries are
 *                  normalized
 * @param remainder number of leading coefficients that are kept
607  */
608 static void calc_input_response(WMAVoiceContext *s, float *lpcs,
609  int fcb_type, float *coeffs, int remainder)
610 {
611  float last_coeff, min = 15.0, max = -15.0;
612  float irange, angle_mul, gain_mul, range, sq;
613  int n, idx;
614 
615  /* Create frequency power spectrum of speech input (i.e. RDFT of LPCs) */
616  s->rdft.rdft_calc(&s->rdft, lpcs);
 /* log_range() stores log10 of a power value in var and tracks the running
  * minimum/maximum of all values seen so far */
617 #define log_range(var, assign) do { \
618  float tmp = log10f(assign); var = tmp; \
619  max = FFMAX(max, tmp); min = FFMIN(min, tmp); \
620  } while (0)
621  log_range(last_coeff, lpcs[1] * lpcs[1]);
622  for (n = 1; n < 64; n++)
623  log_range(lpcs[n], lpcs[n * 2] * lpcs[n * 2] +
624  lpcs[n * 2 + 1] * lpcs[n * 2 + 1]);
625  log_range(lpcs[0], lpcs[0] * lpcs[0]);
626 #undef log_range
627  range = max - min;
628  lpcs[64] = last_coeff;
629 
630  /* Now, use this spectrum to pick out these frequencies with higher
631  * (relative) power/energy (which we then take to be "not noise"),
632  * and set up a table (still in lpcs[]) of (relative) gains per frequency.
633  * These frequencies will be maintained, while others ("noise") will be
634  * decreased in the filter output. */
635  irange = 64.0 / range; // so irange*(max-value) is in the range [0, 63]
636  gain_mul = range * (fcb_type == FCB_TYPE_HARDCODED ? (5.0 / 13.0) :
637  (5.0 / 14.7));
638  angle_mul = gain_mul * (8.0 * M_LN10 / M_PI);
639  for (n = 0; n <= 64; n++) {
640  float pwr;
641 
642  idx = FFMAX(0, lrint((max - lpcs[n]) * irange) - 1);
 /* NOTE(review): no assignment to pwr is visible in this listing before
  * it is used below; the statement between these two lines appears to be
  * missing - confirm against the upstream file. */
644  lpcs[n] = angle_mul * pwr;
645 
646  /* 70.57 =~ 1/log10(1.0331663) */
647  idx = (pwr * gain_mul - 0.0295) * 70.570526123;
648  if (idx > 127) { // fall back if index falls outside table range
649  coeffs[n] = wmavoice_energy_table[127] *
650  powf(1.0331663, idx - 127);
651  } else
652  coeffs[n] = wmavoice_energy_table[FFMAX(0, idx)];
653  }
654 
655  /* calculate the Hilbert transform of the gains, which we do (since this
656  * is a sine input) by doing a phase shift (in theory, H(sin())=cos()).
657  * Hilbert_Transform(RDFT(x)) = Laplace_Transform(x), which calculates the
658  * "moment" of the LPCs in this filter. */
659  s->dct.dct_calc(&s->dct, lpcs);
660  s->dst.dct_calc(&s->dst, lpcs);
661 
662  /* Split out the coefficient indexes into phase/magnitude pairs */
663  idx = 255 + av_clip(lpcs[64], -255, 255);
664  coeffs[0] = coeffs[0] * s->cos[idx];
665  idx = 255 + av_clip(lpcs[64] - 2 * lpcs[63], -255, 255);
666  last_coeff = coeffs[64] * s->cos[idx];
 /* each pass handles two entries (n, then n - 1), which differ only in the
  * sign applied to lpcs[64]; the loop runs from n = 63 down to n = 1 */
667  for (n = 63;; n--) {
668  idx = 255 + av_clip(-lpcs[64] - 2 * lpcs[n - 1], -255, 255);
669  coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
670  coeffs[n * 2] = coeffs[n] * s->cos[idx];
671 
672  if (!--n) break;
673 
674  idx = 255 + av_clip( lpcs[64] - 2 * lpcs[n - 1], -255, 255);
675  coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
676  coeffs[n * 2] = coeffs[n] * s->cos[idx];
677  }
678  coeffs[1] = last_coeff;
679 
680  /* move into real domain */
681  s->irdft.rdft_calc(&s->irdft, coeffs);
682 
683  /* tilt correction and normalize scale */
684  memset(&coeffs[remainder], 0, sizeof(coeffs[0]) * (128 - remainder));
685  if (s->denoise_tilt_corr) {
686  float tilt_mem = 0;
687 
688  coeffs[remainder - 1] = 0;
689  ff_tilt_compensation(&tilt_mem,
690  -1.8 * tilt_factor(coeffs, remainder - 1),
691  coeffs, remainder);
692  }
693  sq = (1.0 / 64.0) * sqrtf(1 / avpriv_scalarproduct_float_c(coeffs, coeffs,
694  remainder));
695  for (n = 0; n < remainder; n++)
696  coeffs[n] *= sq;
697 }
698 
699 /**
700  * This function applies a Wiener filter on the (noisy) speech signal as
701  * a means to denoise it.
702  *
703  * - take RDFT of LPCs to get the power spectrum of the noise + speech;
704  * - using this power spectrum, calculate (for each frequency) the Wiener
705  * filter gain, which depends on the frequency power and desired level
706  * of noise subtraction (when set too high, this leads to artifacts)
707  * We can do this symmetrically over the X-axis (so 0-4kHz is the inverse
708  * of 4-8kHz);
709  * - by doing a phase shift, calculate the Hilbert transform of this array
710  * of per-frequency filter-gains to get the filtering coefficients;
711  * - smoothen/normalize/de-tilt these filter coefficients as desired;
712  * - take RDFT of noisy sound, apply the coefficients and take its IRDFT
713  * to get the denoised speech signal;
714  * - the leftover (i.e. output of the IRDFT on denoised speech data beyond
715  * the frame boundary) are saved and applied to subsequent frames by an
716  * overlap-add method (otherwise you get clicking-artifacts).
717  *
 * For frames of type FCB_TYPE_SILENCE no filtering is done; only the cached
 * leftover of earlier frames is added to the signal.
 *
718  * @param s WMA Voice decoding context
719  * @param fcb_type Frame (codebook) type
720  * @param synth_pf input: the noisy speech signal, output: denoised speech
721  * data; should be 16-byte aligned (for ASM purposes). The buffer is
 * zero-padded from size up to 128 entries, so it must hold at least
 * 128 floats
722  * @param size size of the speech data
723  * @param lpcs LPCs used to synthesize this frame's speech data
724  */
725 static void wiener_denoise(WMAVoiceContext *s, int fcb_type,
726  float *synth_pf, int size,
727  const float *lpcs)
728 {
 /* remainder is only assigned, and only read, when
  * fcb_type != FCB_TYPE_SILENCE */
729  int remainder, lim, n;
730 
731  if (fcb_type != FCB_TYPE_SILENCE) {
732  float *tilted_lpcs = s->tilted_lpcs_pf,
733  *coeffs = s->denoise_coeffs_pf, tilt_mem = 0;
734 
 /* build { 1.0, lpcs[0 .. lsps-1], 0, ... } in a 128-entry buffer and
  * apply tilt compensation to its first lsps + 2 entries */
735  tilted_lpcs[0] = 1.0;
736  memcpy(&tilted_lpcs[1], lpcs, sizeof(lpcs[0]) * s->lsps);
737  memset(&tilted_lpcs[s->lsps + 1], 0,
738  sizeof(tilted_lpcs[0]) * (128 - s->lsps - 1));
739  ff_tilt_compensation(&tilt_mem, 0.7 * tilt_factor(lpcs, s->lsps),
740  tilted_lpcs, s->lsps + 2);
741 
742  /* The IRDFT output (127 samples for 7-bit filter) beyond the frame
743  * size is applied to the next frame. All input beyond this is zero,
744  * and thus all output beyond this will go towards zero, hence we can
745  * limit to min(size-1, 127-size) as a performance consideration. */
746  remainder = FFMIN(127 - size, size - 1);
747  calc_input_response(s, tilted_lpcs, fcb_type, coeffs, remainder);
748 
749  /* apply coefficients (in frequency spectrum domain), i.e. complex
750  * number multiplication */
751  memset(&synth_pf[size], 0, sizeof(synth_pf[0]) * (128 - size));
752  s->rdft.rdft_calc(&s->rdft, synth_pf);
753  s->rdft.rdft_calc(&s->rdft, coeffs);
 /* entries 0 and 1 are multiplied as plain real values; the remaining
  * entries are treated as (real, imaginary) pairs */
754  synth_pf[0] *= coeffs[0];
755  synth_pf[1] *= coeffs[1];
756  for (n = 1; n < 64; n++) {
757  float v1 = synth_pf[n * 2], v2 = synth_pf[n * 2 + 1];
758  synth_pf[n * 2] = v1 * coeffs[n * 2] - v2 * coeffs[n * 2 + 1];
759  synth_pf[n * 2 + 1] = v2 * coeffs[n * 2] + v1 * coeffs[n * 2 + 1];
760  }
761  s->irdft.rdft_calc(&s->irdft, synth_pf);
762  }
763 
764  /* merge filter output with the history of previous runs */
765  if (s->denoise_filter_cache_size) {
766  lim = FFMIN(s->denoise_filter_cache_size, size);
767  for (n = 0; n < lim; n++)
768  synth_pf[n] += s->denoise_filter_cache[n];
769  s->denoise_filter_cache_size -= lim;
 /* NOTE(review): the final argument line of this memmove() call is not
  * visible in this listing - confirm against the upstream file. */
770  memmove(s->denoise_filter_cache, &s->denoise_filter_cache[size],
772  }
773 
774  /* move remainder of filter output into a cache for future runs */
775  if (fcb_type != FCB_TYPE_SILENCE) {
 /* add onto what is still cached, then append the part that extends
  * beyond the currently cached samples */
776  lim = FFMIN(remainder, s->denoise_filter_cache_size);
777  for (n = 0; n < lim; n++)
778  s->denoise_filter_cache[n] += synth_pf[size + n];
779  if (lim < remainder) {
780  memcpy(&s->denoise_filter_cache[lim], &synth_pf[size + lim],
781  sizeof(s->denoise_filter_cache[0]) * (remainder - lim));
782  s->denoise_filter_cache_size = remainder;
783  }
784  }
785 }
786 
787 /**
788  * Averaging projection filter, the postfilter used in WMAVoice.
789  *
790  * This uses the following steps:
791  * - A zero-synthesis filter (generate excitation from synth signal)
792  * - Kalman smoothing on excitation, based on pitch
793  * - Re-synthesized smoothened output
794  * - Iterative Wiener denoise filter
795  * - Adaptive gain filter
796  * - DC filter
797  *
798  * @param s WMAVoice decoding context
799  * @param synth Speech synthesis output (before postfilter)
800  * @param samples Output buffer for filtered samples
801  * @param size Buffer size of synth & samples; must not exceed
 * MAX_FRAMESIZE / 2 (enforced by an assertion)
802  * @param lpcs Generated LPCs used for speech synthesis
803  * @param zero_exc_pf destination for zero synthesis filter (16-byte aligned);
 * kalman_smoothen() reads samples at negative indexes of this
 * buffer, so valid history has to precede it
804  * @param fcb_type Frame type (silence, hardcoded, AW-pulses or FCB-pulses)
805  * @param pitch Pitch of the input signal
806  */
807 static void postfilter(WMAVoiceContext *s, const float *synth,
808  float *samples, int size,
809  const float *lpcs, float *zero_exc_pf,
810  int fcb_type, int pitch)
811 {
812  float synth_filter_in_buf[MAX_FRAMESIZE / 2],
813  *synth_pf = &s->synth_filter_out_buf[MAX_LSPS_ALIGN16],
814  *synth_filter_in = zero_exc_pf;
815 
 /* synth_filter_in_buf only has room for MAX_FRAMESIZE / 2 samples */
816  av_assert0(size <= MAX_FRAMESIZE / 2);
817 
818  /* generate excitation from input signal */
819  ff_celp_lp_zero_synthesis_filterf(zero_exc_pf, lpcs, synth, size, s->lsps);
820 
 /* smoothing is only tried for the AW-pulse and EXC-pulse frame types; if
  * it does not succeed, the unsmoothed excitation is used instead */
821  if (fcb_type >= FCB_TYPE_AW_PULSES &&
822  !kalman_smoothen(s, pitch, zero_exc_pf, synth_filter_in_buf, size))
823  synth_filter_in = synth_filter_in_buf;
824 
825  /* re-synthesize speech after smoothening, and keep history */
826  ff_celp_lp_synthesis_filterf(synth_pf, lpcs,
827  synth_filter_in, size, s->lsps);
 /* copy the last s->lsps output samples in front of synth_pf */
828  memcpy(&synth_pf[-s->lsps], &synth_pf[size - s->lsps],
829  sizeof(synth_pf[0]) * s->lsps);
830 
831  wiener_denoise(s, fcb_type, synth_pf, size, lpcs);
832 
833  adaptive_gain_control(samples, synth_pf, synth, size, 0.99,
834  &s->postfilter_agc);
835 
836  if (s->dc_level > 8) {
837  /* remove ultra-low frequency DC noise / highpass filter;
838  * coefficients are identical to those used in SIPR decoding,
839  * and very closely resemble those used in AMR-NB decoding. */
 /* NOTE(review): the first line of the following call (callee name and
  * leading arguments) is not visible in this listing - confirm against
  * the upstream file. */
841  (const float[2]) { -1.99997, 1.0 },
842  (const float[2]) { -1.9330735188, 0.93589198496 },
843  0.93980580475, s->dcf_mem, size);
844  }
845 }
846 /**
847  * @}
848  */
849 
/**
 * Dequantize LSPs.
 *
 * The output is cleared first. Then, for every stage, one vector of @p num
 * table entries is selected by the stage's quantized value, scaled by the
 * stage's multiplier, offset by the stage's base value and accumulated into
 * the output. The sub-table of stage n + 1 starts sizes[n] * num entries
 * after the sub-table of stage n.
 *
 * @param lsps output pointer to the array that will hold the LSPs
 * @param num number of LSPs to be dequantized
 * @param values quantized values, contains n_stages values
 * @param sizes range (i.e. max value) of each quantized value
 * @param n_stages number of dequantization runs
 * @param table dequantization table to be used
 * @param mul_q LSF multiplier
 * @param base_q base (lowest) LSF values
 */
static void dequant_lsps(double *lsps, int num,
                         const uint16_t *values,
                         const uint16_t *sizes,
                         int n_stages, const uint8_t *table,
                         const double *mul_q,
                         const double *base_q)
{
    const uint8_t *stage_table = table;
    int stage, i;

    memset(lsps, 0, num * sizeof(*lsps));

    for (stage = 0; stage < n_stages; stage++) {
        /* vector selected by this stage's quantized index */
        const uint8_t *vec  = &stage_table[values[stage] * num];
        const double offset = base_q[stage];
        const double scale  = mul_q[stage];

        for (i = 0; i < num; i++)
            lsps[i] += offset + scale * vec[i];

        /* advance to the sub-table of the next stage */
        stage_table += sizes[stage] * num;
    }
}
881 
882 /**
883  * @name LSP dequantization routines
884  * LSP dequantization routines, for 10/16LSPs and independent/residual coding.
885  * lsp10i() consumes 24 bits; lsp10r() consumes an additional 24 bits;
886  * lsp16i() consumes 34 bits; lsp16r() consumes an additional 26 bits.
887  * @{
888  */
889 /**
890  * Parse 10 independently-coded LSPs.
891  */
892 static void dequant_lsp10i(GetBitContext *gb, double *lsps)
893 {
894  static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
895  static const double mul_lsf[4] = {
896  5.2187144800e-3, 1.4626986422e-3,
897  9.6179549166e-4, 1.1325736225e-3
898  };
899  static const double base_lsf[4] = {
900  M_PI * -2.15522e-1, M_PI * -6.1646e-2,
901  M_PI * -3.3486e-2, M_PI * -5.7408e-2
902  };
903  uint16_t v[4];
904 
905  v[0] = get_bits(gb, 8);
906  v[1] = get_bits(gb, 6);
907  v[2] = get_bits(gb, 5);
908  v[3] = get_bits(gb, 5);
909 
910  dequant_lsps(lsps, 10, v, vec_sizes, 4, wmavoice_dq_lsp10i,
911  mul_lsf, base_lsf);
912 }
913 
914 /**
915  * Parse 10 independently-coded LSPs, and then derive the tables to
916  * generate LSPs for the other frames from them (residual coding).
917  */
919  double *i_lsps, const double *old,
920  double *a1, double *a2, int q_mode)
921 {
922  static const uint16_t vec_sizes[3] = { 128, 64, 64 };
923  static const double mul_lsf[3] = {
924  2.5807601174e-3, 1.2354460219e-3, 1.1763821673e-3
925  };
926  static const double base_lsf[3] = {
927  M_PI * -1.07448e-1, M_PI * -5.2706e-2, M_PI * -5.1634e-2
928  };
929  const float (*ipol_tab)[2][10] = q_mode ?
931  uint16_t interpol, v[3];
932  int n;
933 
934  dequant_lsp10i(gb, i_lsps);
935 
936  interpol = get_bits(gb, 5);
937  v[0] = get_bits(gb, 7);
938  v[1] = get_bits(gb, 6);
939  v[2] = get_bits(gb, 6);
940 
941  for (n = 0; n < 10; n++) {
942  double delta = old[n] - i_lsps[n];
943  a1[n] = ipol_tab[interpol][0][n] * delta + i_lsps[n];
944  a1[10 + n] = ipol_tab[interpol][1][n] * delta + i_lsps[n];
945  }
946 
947  dequant_lsps(a2, 20, v, vec_sizes, 3, wmavoice_dq_lsp10r,
948  mul_lsf, base_lsf);
949 }
950 
951 /**
952  * Parse 16 independently-coded LSPs.
953  */
954 static void dequant_lsp16i(GetBitContext *gb, double *lsps)
955 {
956  static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 };
957  static const double mul_lsf[5] = {
958  3.3439586280e-3, 6.9908173703e-4,
959  3.3216608306e-3, 1.0334960326e-3,
960  3.1899104283e-3
961  };
962  static const double base_lsf[5] = {
963  M_PI * -1.27576e-1, M_PI * -2.4292e-2,
964  M_PI * -1.28094e-1, M_PI * -3.2128e-2,
965  M_PI * -1.29816e-1
966  };
967  uint16_t v[5];
968 
969  v[0] = get_bits(gb, 8);
970  v[1] = get_bits(gb, 6);
971  v[2] = get_bits(gb, 7);
972  v[3] = get_bits(gb, 6);
973  v[4] = get_bits(gb, 7);
974 
975  dequant_lsps( lsps, 5, v, vec_sizes, 2,
976  wmavoice_dq_lsp16i1, mul_lsf, base_lsf);
977  dequant_lsps(&lsps[5], 5, &v[2], &vec_sizes[2], 2,
978  wmavoice_dq_lsp16i2, &mul_lsf[2], &base_lsf[2]);
979  dequant_lsps(&lsps[10], 6, &v[4], &vec_sizes[4], 1,
980  wmavoice_dq_lsp16i3, &mul_lsf[4], &base_lsf[4]);
981 }
982 
983 /**
984  * Parse 16 independently-coded LSPs, and then derive the tables to
985  * generate LSPs for the other frames from them (residual coding).
986  */
988  double *i_lsps, const double *old,
989  double *a1, double *a2, int q_mode)
990 {
991  static const uint16_t vec_sizes[3] = { 128, 128, 128 };
992  static const double mul_lsf[3] = {
993  1.2232979501e-3, 1.4062241527e-3, 1.6114744851e-3
994  };
995  static const double base_lsf[3] = {
996  M_PI * -5.5830e-2, M_PI * -5.2908e-2, M_PI * -5.4776e-2
997  };
998  const float (*ipol_tab)[2][16] = q_mode ?
1000  uint16_t interpol, v[3];
1001  int n;
1002 
1003  dequant_lsp16i(gb, i_lsps);
1004 
1005  interpol = get_bits(gb, 5);
1006  v[0] = get_bits(gb, 7);
1007  v[1] = get_bits(gb, 7);
1008  v[2] = get_bits(gb, 7);
1009 
1010  for (n = 0; n < 16; n++) {
1011  double delta = old[n] - i_lsps[n];
1012  a1[n] = ipol_tab[interpol][0][n] * delta + i_lsps[n];
1013  a1[16 + n] = ipol_tab[interpol][1][n] * delta + i_lsps[n];
1014  }
1015 
1016  dequant_lsps( a2, 10, v, vec_sizes, 1,
1017  wmavoice_dq_lsp16r1, mul_lsf, base_lsf);
1018  dequant_lsps(&a2[10], 10, &v[1], &vec_sizes[1], 1,
1019  wmavoice_dq_lsp16r2, &mul_lsf[1], &base_lsf[1]);
1020  dequant_lsps(&a2[20], 12, &v[2], &vec_sizes[2], 1,
1021  wmavoice_dq_lsp16r3, &mul_lsf[2], &base_lsf[2]);
1022 }
1023 
1024 /**
1025  * @}
1026  * @name Pitch-adaptive window coding functions
1027  * The next few functions are for pitch-adaptive window coding.
1028  * @{
1029  */
1030 /**
1031  * Parse the offset of the first pitch-adaptive window pulses, and
1032  * the distribution of pulses between the two blocks in this frame.
1033  * @param s WMA Voice decoding context private data
1034  * @param gb bit I/O context
1035  * @param pitch pitch for each block in this frame
1036  */
1038  const int *pitch)
1039 {
1040  static const int16_t start_offset[94] = {
1041  -11, -9, -7, -5, -3, -1, 1, 3, 5, 7, 9, 11,
1042  13, 15, 18, 17, 19, 20, 21, 22, 23, 24, 25, 26,
1043  27, 28, 29, 30, 31, 32, 33, 35, 37, 39, 41, 43,
1044  45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67,
1045  69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91,
1046  93, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113, 115,
1047  117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139,
1048  141, 143, 145, 147, 149, 151, 153, 155, 157, 159
1049  };
1050  int bits, offset;
1051 
1052  /* position of pulse */
1053  s->aw_idx_is_ext = 0;
1054  if ((bits = get_bits(gb, 6)) >= 54) {
1055  s->aw_idx_is_ext = 1;
1056  bits += (bits - 54) * 3 + get_bits(gb, 2);
1057  }
1058 
1059  /* for a repeated pulse at pulse_off with a pitch_lag of pitch[], count
1060  * the distribution of the pulses in each block contained in this frame. */
1061  s->aw_pulse_range = FFMIN(pitch[0], pitch[1]) > 32 ? 24 : 16;
1062  for (offset = start_offset[bits]; offset < 0; offset += pitch[0]) ;
1063  s->aw_n_pulses[0] = (pitch[0] - 1 + MAX_FRAMESIZE / 2 - offset) / pitch[0];
1064  s->aw_first_pulse_off[0] = offset - s->aw_pulse_range / 2;
1065  offset += s->aw_n_pulses[0] * pitch[0];
1066  s->aw_n_pulses[1] = (pitch[1] - 1 + MAX_FRAMESIZE - offset) / pitch[1];
1067  s->aw_first_pulse_off[1] = offset - (MAX_FRAMESIZE + s->aw_pulse_range) / 2;
1068 
1069  /* if continuing from a position before the block, reset position to
1070  * start of block (when corrected for the range over which it can be
1071  * spread in aw_pulse_set1()). */
1072  if (start_offset[bits] < MAX_FRAMESIZE / 2) {
1073  while (s->aw_first_pulse_off[1] - pitch[1] + s->aw_pulse_range > 0)
1074  s->aw_first_pulse_off[1] -= pitch[1];
1075  if (start_offset[bits] < 0)
1076  while (s->aw_first_pulse_off[0] - pitch[0] + s->aw_pulse_range > 0)
1077  s->aw_first_pulse_off[0] -= pitch[0];
1078  }
1079 }
1080 
1081 /**
1082  * Apply second set of pitch-adaptive window pulses.
1083  * @param s WMA Voice decoding context private data
1084  * @param gb bit I/O context
1085  * @param block_idx block index in frame [0, 1]
1086  * @param fcb structure containing fixed codebook vector info
1087  * @return -1 on error, 0 otherwise
1088  */
1090  int block_idx, AMRFixed *fcb)
1091 {
1092  uint16_t use_mask_mem[9]; // only 5 are used, rest is padding
1093  uint16_t *use_mask = use_mask_mem + 2;
1094  /* in this function, idx is the index in the 80-bit (+ padding) use_mask
1095  * bit-array. Since use_mask consists of 16-bit values, the lower 4 bits
1096  * of idx are the position of the bit within a particular item in the
1097  * array (0 being the most significant bit, and 15 being the least
1098  * significant bit), and the remainder (>> 4) is the index in the
1099  * use_mask[]-array. This is faster and uses less memory than using a
1100  * 80-byte/80-int array. */
1101  int pulse_off = s->aw_first_pulse_off[block_idx],
1102  pulse_start, n, idx, range, aidx, start_off = 0;
1103 
1104  /* set offset of first pulse to within this block */
1105  if (s->aw_n_pulses[block_idx] > 0)
1106  while (pulse_off + s->aw_pulse_range < 1)
1107  pulse_off += fcb->pitch_lag;
1108 
1109  /* find range per pulse */
1110  if (s->aw_n_pulses[0] > 0) {
1111  if (block_idx == 0) {
1112  range = 32;
1113  } else /* block_idx = 1 */ {
1114  range = 8;
1115  if (s->aw_n_pulses[block_idx] > 0)
1116  pulse_off = s->aw_next_pulse_off_cache;
1117  }
1118  } else
1119  range = 16;
1120  pulse_start = s->aw_n_pulses[block_idx] > 0 ? pulse_off - range / 2 : 0;
1121 
1122  /* aw_pulse_set1() already applies pulses around pulse_off (to be exactly,
1123  * in the range of [pulse_off, pulse_off + s->aw_pulse_range], and thus
1124  * we exclude that range from being pulsed again in this function. */
1125  memset(&use_mask[-2], 0, 2 * sizeof(use_mask[0]));
1126  memset( use_mask, -1, 5 * sizeof(use_mask[0]));
1127  memset(&use_mask[5], 0, 2 * sizeof(use_mask[0]));
1128  if (s->aw_n_pulses[block_idx] > 0)
1129  for (idx = pulse_off; idx < MAX_FRAMESIZE / 2; idx += fcb->pitch_lag) {
1130  int excl_range = s->aw_pulse_range; // always 16 or 24
1131  uint16_t *use_mask_ptr = &use_mask[idx >> 4];
1132  int first_sh = 16 - (idx & 15);
1133  *use_mask_ptr++ &= 0xFFFFu << first_sh;
1134  excl_range -= first_sh;
1135  if (excl_range >= 16) {
1136  *use_mask_ptr++ = 0;
1137  *use_mask_ptr &= 0xFFFF >> (excl_range - 16);
1138  } else
1139  *use_mask_ptr &= 0xFFFF >> excl_range;
1140  }
1141 
1142  /* find the 'aidx'th offset that is not excluded */
1143  aidx = get_bits(gb, s->aw_n_pulses[0] > 0 ? 5 - 2 * block_idx : 4);
1144  for (n = 0; n <= aidx; pulse_start++) {
1145  for (idx = pulse_start; idx < 0; idx += fcb->pitch_lag) ;
1146  if (idx >= MAX_FRAMESIZE / 2) { // find from zero
1147  if (use_mask[0]) idx = 0x0F;
1148  else if (use_mask[1]) idx = 0x1F;
1149  else if (use_mask[2]) idx = 0x2F;
1150  else if (use_mask[3]) idx = 0x3F;
1151  else if (use_mask[4]) idx = 0x4F;
1152  else return -1;
1153  idx -= av_log2_16bit(use_mask[idx >> 4]);
1154  }
1155  if (use_mask[idx >> 4] & (0x8000 >> (idx & 15))) {
1156  use_mask[idx >> 4] &= ~(0x8000 >> (idx & 15));
1157  n++;
1158  start_off = idx;
1159  }
1160  }
1161 
1162  fcb->x[fcb->n] = start_off;
1163  fcb->y[fcb->n] = get_bits1(gb) ? -1.0 : 1.0;
1164  fcb->n++;
1165 
1166  /* set offset for next block, relative to start of that block */
1167  n = (MAX_FRAMESIZE / 2 - start_off) % fcb->pitch_lag;
1168  s->aw_next_pulse_off_cache = n ? fcb->pitch_lag - n : 0;
1169  return 0;
1170 }
1171 
1172 /**
1173  * Apply first set of pitch-adaptive window pulses.
1174  * @param s WMA Voice decoding context private data
1175  * @param gb bit I/O context
1176  * @param block_idx block index in frame [0, 1]
1177  * @param fcb storage location for fixed codebook pulse info
1178  */
1180  int block_idx, AMRFixed *fcb)
1181 {
1182  int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
1183  float v;
1184 
1185  if (s->aw_n_pulses[block_idx] > 0) {
1186  int n, v_mask, i_mask, sh, n_pulses;
1187 
1188  if (s->aw_pulse_range == 24) { // 3 pulses, 1:sign + 3:index each
1189  n_pulses = 3;
1190  v_mask = 8;
1191  i_mask = 7;
1192  sh = 4;
1193  } else { // 4 pulses, 1:sign + 2:index each
1194  n_pulses = 4;
1195  v_mask = 4;
1196  i_mask = 3;
1197  sh = 3;
1198  }
1199 
1200  for (n = n_pulses - 1; n >= 0; n--, val >>= sh) {
1201  fcb->y[fcb->n] = (val & v_mask) ? -1.0 : 1.0;
1202  fcb->x[fcb->n] = (val & i_mask) * n_pulses + n +
1203  s->aw_first_pulse_off[block_idx];
1204  while (fcb->x[fcb->n] < 0)
1205  fcb->x[fcb->n] += fcb->pitch_lag;
1206  if (fcb->x[fcb->n] < MAX_FRAMESIZE / 2)
1207  fcb->n++;
1208  }
1209  } else {
1210  int num2 = (val & 0x1FF) >> 1, delta, idx;
1211 
1212  if (num2 < 1 * 79) { delta = 1; idx = num2 + 1; }
1213  else if (num2 < 2 * 78) { delta = 3; idx = num2 + 1 - 1 * 77; }
1214  else if (num2 < 3 * 77) { delta = 5; idx = num2 + 1 - 2 * 76; }
1215  else { delta = 7; idx = num2 + 1 - 3 * 75; }
1216  v = (val & 0x200) ? -1.0 : 1.0;
1217 
1218  fcb->no_repeat_mask |= 3 << fcb->n;
1219  fcb->x[fcb->n] = idx - delta;
1220  fcb->y[fcb->n] = v;
1221  fcb->x[fcb->n + 1] = idx;
1222  fcb->y[fcb->n + 1] = (val & 1) ? -v : v;
1223  fcb->n += 2;
1224  }
1225 }
1226 
/**
 * @}
 *
 * Derive a deterministic pseudo-random table offset from frame_cntr and
 * block_num. The result is reduced modulo (1000 - block_size), i.e. it lies
 * in [0, 1000 - block_size), so that block_size consecutive entries can be
 * read from a table of 1000 entries starting at the returned index.
 *
 * @param frame_cntr current frame number
 * @param block_num current block index
 * @param block_size amount of entries we want to read from a table
 *                   that has 1000 entries
 * @return a (non-)random number in the [0, 1000 - block_size) range.
 */
static int pRNG(int frame_cntr, int block_num, int block_size)
{
    /* Lookup table replacing the division in
     *   y = (x % 9) * 5 + 6;
     *   z = (49995 * x) / y;
     * y can only take 9 different values, so for each of them z is
     * rewritten as
     *   z = x * (49995 / y) + x * ((49995 % y) / y)
     * Each row belongs to one value of y: the first entry is 49995 / y,
     * the second is the FASTDIV-style multiplier for (49995 % y) / y. */
    static const unsigned int div_tbl[9][2] = {
        { 8332,  3 * 715827883U }, // y =  6
        { 4545,  0 * 390451573U }, // y = 11
        { 3124, 11 * 268435456U }, // y = 16
        { 2380, 15 * 204522253U }, // y = 21
        { 1922, 23 * 165191050U }, // y = 26
        { 1612, 23 * 138547333U }, // y = 31
        { 1388, 27 * 119304648U }, // y = 36
        { 1219, 16 * 104755300U }, // y = 41
        { 1086, 39 * 93368855U  }  // y = 46
    };
    unsigned int seed, row, quot;

    seed = MUL16(block_num, 1877) + frame_cntr;
    /* a single conditional subtraction stands in for "% 0xFFFF"; the
     * original note puts the maximum input at 8*1877+0xFFFE=0x13AA6 */
    if (seed >= 0xFFFF)
        seed -= 0xFFFF;

    row  = seed - 9 * MULH(477218589, seed); // seed % 9
    quot = (uint16_t) (seed * div_tbl[row][0] + UMULH(seed, div_tbl[row][1]));
                                             // seed * 49995 / (row * 5 + 6)
    return quot % (1000 - block_size);
}
1271 
1272 /**
1273  * Parse hardcoded signal for a single block.
1274  * @note see #synth_block().
1275  */
1277  int block_idx, int size,
1278  const struct frame_type_desc *frame_desc,
1279  float *excitation)
1280 {
1281  float gain;
1282  int n, r_idx;
1283 
1284  av_assert0(size <= MAX_FRAMESIZE);
1285 
1286  /* Set the offset from which we start reading wmavoice_std_codebook */
1287  if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
1288  r_idx = pRNG(s->frame_cntr, block_idx, size);
1289  gain = s->silence_gain;
1290  } else /* FCB_TYPE_HARDCODED */ {
1291  r_idx = get_bits(gb, 8);
1292  gain = wmavoice_gain_universal[get_bits(gb, 6)];
1293  }
1294 
1295  /* Clear gain prediction parameters */
1296  memset(s->gain_pred_err, 0, sizeof(s->gain_pred_err));
1297 
1298  /* Apply gain to hardcoded codebook and use that as excitation signal */
1299  for (n = 0; n < size; n++)
1300  excitation[n] = wmavoice_std_codebook[r_idx + n] * gain;
1301 }
1302 
1303 /**
1304  * Parse FCB/ACB signal for a single block.
1305  * @note see #synth_block().
1306  */
1308  int block_idx, int size,
1309  int block_pitch_sh2,
1310  const struct frame_type_desc *frame_desc,
1311  float *excitation)
1312 {
1313  static const float gain_coeff[6] = {
1314  0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
1315  };
1316  float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
1317  int n, idx, gain_weight;
1318  AMRFixed fcb;
1319 
1320  av_assert0(size <= MAX_FRAMESIZE / 2);
1321  memset(pulses, 0, sizeof(*pulses) * size);
1322 
1323  fcb.pitch_lag = block_pitch_sh2 >> 2;
1324  fcb.pitch_fac = 1.0;
1325  fcb.no_repeat_mask = 0;
1326  fcb.n = 0;
1327 
1328  /* For the other frame types, this is where we apply the innovation
1329  * (fixed) codebook pulses of the speech signal. */
1330  if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1331  aw_pulse_set1(s, gb, block_idx, &fcb);
1332  if (aw_pulse_set2(s, gb, block_idx, &fcb)) {
1333  /* Conceal the block with silence and return.
1334  * Skip the correct amount of bits to read the next
1335  * block from the correct offset. */
1336  int r_idx = pRNG(s->frame_cntr, block_idx, size);
1337 
1338  for (n = 0; n < size; n++)
1339  excitation[n] =
1340  wmavoice_std_codebook[r_idx + n] * s->silence_gain;
1341  skip_bits(gb, 7 + 1);
1342  return;
1343  }
1344  } else /* FCB_TYPE_EXC_PULSES */ {
1345  int offset_nbits = 5 - frame_desc->log_n_blocks;
1346 
1347  fcb.no_repeat_mask = -1;
1348  /* similar to ff_decode_10_pulses_35bits(), but with single pulses
1349  * (instead of double) for a subset of pulses */
1350  for (n = 0; n < 5; n++) {
1351  float sign;
1352  int pos1, pos2;
1353 
1354  sign = get_bits1(gb) ? 1.0 : -1.0;
1355  pos1 = get_bits(gb, offset_nbits);
1356  fcb.x[fcb.n] = n + 5 * pos1;
1357  fcb.y[fcb.n++] = sign;
1358  if (n < frame_desc->dbl_pulses) {
1359  pos2 = get_bits(gb, offset_nbits);
1360  fcb.x[fcb.n] = n + 5 * pos2;
1361  fcb.y[fcb.n++] = (pos1 < pos2) ? -sign : sign;
1362  }
1363  }
1364  }
1365  ff_set_fixed_vector(pulses, &fcb, 1.0, size);
1366 
1367  /* Calculate gain for adaptive & fixed codebook signal.
1368  * see ff_amr_set_fixed_gain(). */
1369  idx = get_bits(gb, 7);
1371  gain_coeff, 6) -
1372  5.2409161640 + wmavoice_gain_codebook_fcb[idx]);
1373  acb_gain = wmavoice_gain_codebook_acb[idx];
1374  pred_err = av_clipf(wmavoice_gain_codebook_fcb[idx],
1375  -2.9957322736 /* log(0.05) */,
1376  1.6094379124 /* log(5.0) */);
1377 
1378  gain_weight = 8 >> frame_desc->log_n_blocks;
1379  memmove(&s->gain_pred_err[gain_weight], s->gain_pred_err,
1380  sizeof(*s->gain_pred_err) * (6 - gain_weight));
1381  for (n = 0; n < gain_weight; n++)
1382  s->gain_pred_err[n] = pred_err;
1383 
1384  /* Calculation of adaptive codebook */
1385  if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
1386  int len;
1387  for (n = 0; n < size; n += len) {
1388  int next_idx_sh16;
1389  int abs_idx = block_idx * size + n;
1390  int pitch_sh16 = (s->last_pitch_val << 16) +
1391  s->pitch_diff_sh16 * abs_idx;
1392  int pitch = (pitch_sh16 + 0x6FFF) >> 16;
1393  int idx_sh16 = ((pitch << 16) - pitch_sh16) * 8 + 0x58000;
1394  idx = idx_sh16 >> 16;
1395  if (s->pitch_diff_sh16) {
1396  if (s->pitch_diff_sh16 > 0) {
1397  next_idx_sh16 = (idx_sh16) &~ 0xFFFF;
1398  } else
1399  next_idx_sh16 = (idx_sh16 + 0x10000) &~ 0xFFFF;
1400  len = av_clip((idx_sh16 - next_idx_sh16) / s->pitch_diff_sh16 / 8,
1401  1, size - n);
1402  } else
1403  len = size;
1404 
1405  ff_acelp_interpolatef(&excitation[n], &excitation[n - pitch],
1407  idx, 9, len);
1408  }
1409  } else /* ACB_TYPE_HAMMING */ {
1410  int block_pitch = block_pitch_sh2 >> 2;
1411  idx = block_pitch_sh2 & 3;
1412  if (idx) {
1413  ff_acelp_interpolatef(excitation, &excitation[-block_pitch],
1415  idx, 8, size);
1416  } else
1417  av_memcpy_backptr((uint8_t *) excitation, sizeof(float) * block_pitch,
1418  sizeof(float) * size);
1419  }
1420 
1421  /* Interpolate ACB/FCB and use as excitation signal */
1422  ff_weighted_vector_sumf(excitation, excitation, pulses,
1423  acb_gain, fcb_gain, size);
1424 }
1425 
1426 /**
1427  * Parse data in a single block.
1428  *
1429  * @param s WMA Voice decoding context private data
1430  * @param gb bit I/O context
1431  * @param block_idx index of the to-be-read block
1432  * @param size amount of samples to be read in this block
1433  * @param block_pitch_sh2 pitch for this block << 2
1434  * @param lsps LSPs for (the end of) this frame
1435  * @param prev_lsps LSPs for the last frame
1436  * @param frame_desc frame type descriptor
1437  * @param excitation target memory for the ACB+FCB interpolated signal
1438  * @param synth target memory for the speech synthesis filter output
1439  * @return 0 on success, <0 on error.
1440  */
1442  int block_idx, int size,
1443  int block_pitch_sh2,
1444  const double *lsps, const double *prev_lsps,
1445  const struct frame_type_desc *frame_desc,
1446  float *excitation, float *synth)
1447 {
1448  double i_lsps[MAX_LSPS];
1449  float lpcs[MAX_LSPS];
1450  float fac;
1451  int n;
1452 
1453  if (frame_desc->acb_type == ACB_TYPE_NONE)
1454  synth_block_hardcoded(s, gb, block_idx, size, frame_desc, excitation);
1455  else
1456  synth_block_fcb_acb(s, gb, block_idx, size, block_pitch_sh2,
1457  frame_desc, excitation);
1458 
1459  /* convert interpolated LSPs to LPCs */
1460  fac = (block_idx + 0.5) / frame_desc->n_blocks;
1461  for (n = 0; n < s->lsps; n++) // LSF -> LSP
1462  i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n] - prev_lsps[n]));
1463  ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1464 
1465  /* Speech synthesis */
1466  ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s->lsps);
1467 }
1468 
1469 /**
1470  * Synthesize output samples for a single frame.
1471  *
1472  * @param ctx WMA Voice decoder context
1473  * @param gb bit I/O context (s->gb or one for cross-packet superframes)
1474  * @param frame_idx Frame number within superframe [0-2]
1475  * @param samples pointer to output sample buffer, has space for at least 160
1476  * samples
1477  * @param lsps LSP array
1478  * @param prev_lsps array of previous frame's LSPs
1479  * @param excitation target buffer for excitation signal
1480  * @param synth target buffer for synthesized speech data
1481  * @return 0 on success, <0 on error.
1482  */
1483 static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx,
1484  float *samples,
1485  const double *lsps, const double *prev_lsps,
1486  float *excitation, float *synth)
1487 {
1488  WMAVoiceContext *s = ctx->priv_data;
1489  int n, n_blocks_x2, log_n_blocks_x2, av_uninit(cur_pitch_val);
1490  int pitch[MAX_BLOCKS], av_uninit(last_block_pitch);
1491 
1492  /* Parse frame type ("frame header"), see frame_descs */
1493  int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)], block_nsamples;
1494 
1495  if (bd_idx < 0) {
1496  av_log(ctx, AV_LOG_ERROR,
1497  "Invalid frame type VLC code, skipping\n");
1498  return AVERROR_INVALIDDATA;
1499  }
1500 
1501  block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;
1502 
1503  /* Pitch calculation for ACB_TYPE_ASYMMETRIC ("pitch-per-frame") */
1504  if (frame_descs[bd_idx].acb_type == ACB_TYPE_ASYMMETRIC) {
1505  /* Pitch is provided per frame, which is interpreted as the pitch of
1506  * the last sample of the last block of this frame. We can interpolate
1507  * the pitch of other blocks (and even pitch-per-sample) by gradually
1508  * incrementing/decrementing prev_frame_pitch to cur_pitch_val. */
1509  n_blocks_x2 = frame_descs[bd_idx].n_blocks << 1;
1510  log_n_blocks_x2 = frame_descs[bd_idx].log_n_blocks + 1;
1511  cur_pitch_val = s->min_pitch_val + get_bits(gb, s->pitch_nbits);
1512  cur_pitch_val = FFMIN(cur_pitch_val, s->max_pitch_val - 1);
1513  if (s->last_acb_type == ACB_TYPE_NONE ||
1514  20 * abs(cur_pitch_val - s->last_pitch_val) >
1515  (cur_pitch_val + s->last_pitch_val))
1516  s->last_pitch_val = cur_pitch_val;
1517 
1518  /* pitch per block */
1519  for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1520  int fac = n * 2 + 1;
1521 
1522  pitch[n] = (MUL16(fac, cur_pitch_val) +
1523  MUL16((n_blocks_x2 - fac), s->last_pitch_val) +
1524  frame_descs[bd_idx].n_blocks) >> log_n_blocks_x2;
1525  }
1526 
1527  /* "pitch-diff-per-sample" for calculation of pitch per sample */
1528  s->pitch_diff_sh16 =
1529  ((cur_pitch_val - s->last_pitch_val) << 16) / MAX_FRAMESIZE;
1530  }
1531 
1532  /* Global gain (if silence) and pitch-adaptive window coordinates */
1533  switch (frame_descs[bd_idx].fcb_type) {
1534  case FCB_TYPE_SILENCE:
1536  break;
1537  case FCB_TYPE_AW_PULSES:
1538  aw_parse_coords(s, gb, pitch);
1539  break;
1540  }
1541 
1542  for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1543  int bl_pitch_sh2;
1544 
1545  /* Pitch calculation for ACB_TYPE_HAMMING ("pitch-per-block") */
1546  switch (frame_descs[bd_idx].acb_type) {
1547  case ACB_TYPE_HAMMING: {
1548  /* Pitch is given per block. Per-block pitches are encoded as an
1549  * absolute value for the first block, and then delta values
1550  * relative to this value) for all subsequent blocks. The scale of
1551  * this pitch value is semi-logarithmic compared to its use in the
1552  * decoder, so we convert it to normal scale also. */
1553  int block_pitch,
1554  t1 = (s->block_conv_table[1] - s->block_conv_table[0]) << 2,
1555  t2 = (s->block_conv_table[2] - s->block_conv_table[1]) << 1,
1556  t3 = s->block_conv_table[3] - s->block_conv_table[2] + 1;
1557 
1558  if (n == 0) {
1559  block_pitch = get_bits(gb, s->block_pitch_nbits);
1560  } else
1561  block_pitch = last_block_pitch - s->block_delta_pitch_hrange +
1563  /* Convert last_ so that any next delta is within _range */
1564  last_block_pitch = av_clip(block_pitch,
1566  s->block_pitch_range -
1568 
1569  /* Convert semi-log-style scale back to normal scale */
1570  if (block_pitch < t1) {
1571  bl_pitch_sh2 = (s->block_conv_table[0] << 2) + block_pitch;
1572  } else {
1573  block_pitch -= t1;
1574  if (block_pitch < t2) {
1575  bl_pitch_sh2 =
1576  (s->block_conv_table[1] << 2) + (block_pitch << 1);
1577  } else {
1578  block_pitch -= t2;
1579  if (block_pitch < t3) {
1580  bl_pitch_sh2 =
1581  (s->block_conv_table[2] + block_pitch) << 2;
1582  } else
1583  bl_pitch_sh2 = s->block_conv_table[3] << 2;
1584  }
1585  }
1586  pitch[n] = bl_pitch_sh2 >> 2;
1587  break;
1588  }
1589 
1590  case ACB_TYPE_ASYMMETRIC: {
1591  bl_pitch_sh2 = pitch[n] << 2;
1592  break;
1593  }
1594 
1595  default: // ACB_TYPE_NONE has no pitch
1596  bl_pitch_sh2 = 0;
1597  break;
1598  }
1599 
1600  synth_block(s, gb, n, block_nsamples, bl_pitch_sh2,
1601  lsps, prev_lsps, &frame_descs[bd_idx],
1602  &excitation[n * block_nsamples],
1603  &synth[n * block_nsamples]);
1604  }
1605 
1606  /* Averaging projection filter, if applicable. Else, just copy samples
1607  * from synthesis buffer */
1608  if (s->do_apf) {
1609  double i_lsps[MAX_LSPS];
1610  float lpcs[MAX_LSPS];
1611 
1612  for (n = 0; n < s->lsps; n++) // LSF -> LSP
1613  i_lsps[n] = cos(0.5 * (prev_lsps[n] + lsps[n]));
1614  ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1615  postfilter(s, synth, samples, 80, lpcs,
1616  &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx],
1617  frame_descs[bd_idx].fcb_type, pitch[0]);
1618 
1619  for (n = 0; n < s->lsps; n++) // LSF -> LSP
1620  i_lsps[n] = cos(lsps[n]);
1621  ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1622  postfilter(s, &synth[80], &samples[80], 80, lpcs,
1623  &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx + 80],
1624  frame_descs[bd_idx].fcb_type, pitch[0]);
1625  } else
1626  memcpy(samples, synth, 160 * sizeof(synth[0]));
1627 
1628  /* Cache values for next frame */
1629  s->frame_cntr++;
1630  if (s->frame_cntr >= 0xFFFF) s->frame_cntr -= 0xFFFF; // i.e. modulo (%)
1631  s->last_acb_type = frame_descs[bd_idx].acb_type;
1632  switch (frame_descs[bd_idx].acb_type) {
1633  case ACB_TYPE_NONE:
1634  s->last_pitch_val = 0;
1635  break;
1636  case ACB_TYPE_ASYMMETRIC:
1637  s->last_pitch_val = cur_pitch_val;
1638  break;
1639  case ACB_TYPE_HAMMING:
1640  s->last_pitch_val = pitch[frame_descs[bd_idx].n_blocks - 1];
1641  break;
1642  }
1643 
1644  return 0;
1645 }
1646 
1647 /**
1648  * Ensure minimum value for first item, maximum value for last value,
1649  * proper spacing between each value and proper ordering.
1650  *
1651  * @param lsps array of LSPs
1652  * @param num size of LSP array
1653  *
1654  * @note basically a double version of #ff_acelp_reorder_lsf(), might be
1655  * useful to put in a generic location later on. Parts are also
1656  * present in #ff_set_min_dist_lsf() + #ff_sort_nearly_sorted_floats(),
1657  * which is in float.
1658  */
1659 static void stabilize_lsps(double *lsps, int num)
1660 {
1661  int n, m, l;
1662 
1663  /* set minimum value for first, maximum value for last and minimum
1664  * spacing between LSF values.
1665  * Very similar to ff_set_min_dist_lsf(), but in double. */
1666  lsps[0] = FFMAX(lsps[0], 0.0015 * M_PI);
1667  for (n = 1; n < num; n++)
1668  lsps[n] = FFMAX(lsps[n], lsps[n - 1] + 0.0125 * M_PI);
1669  lsps[num - 1] = FFMIN(lsps[num - 1], 0.9985 * M_PI);
1670 
1671  /* reorder (looks like one-time / non-recursed bubblesort).
1672  * Very similar to ff_sort_nearly_sorted_floats(), but in double. */
1673  for (n = 1; n < num; n++) {
1674  if (lsps[n] < lsps[n - 1]) {
1675  for (m = 1; m < num; m++) {
1676  double tmp = lsps[m];
1677  for (l = m - 1; l >= 0; l--) {
1678  if (lsps[l] <= tmp) break;
1679  lsps[l + 1] = lsps[l];
1680  }
1681  lsps[l + 1] = tmp;
1682  }
1683  break;
1684  }
1685  }
1686 }
1687 
1688 /**
1689  * Synthesize output samples for a single superframe. If we have any data
1690  * cached in s->sframe_cache, that will be used instead of whatever is loaded
1691  * in s->gb.
1692  *
1693  * WMA Voice superframes contain 3 frames, each containing 160 audio samples,
1694  * to give a total of 480 samples per superframe. See #synth_frame() for frame
1695  * parsing. In addition to 3 frames, superframes can also contain the LSPs
1696  * (if these are globally specified for all frames (residually); they can
1697  * also be specified individually per-frame. See the s->has_residual_lsps
1698  * option), and can specify the number of samples encoded in this superframe
1699  * (if less than 480), usually used to prevent blanks at track boundaries.
1700  *
1701  * @param ctx WMA Voice decoder context
1702  * @return 0 on success, <0 on error or 1 if there was not enough data to
1703  * fully parse the superframe
1704  */
1706  int *got_frame_ptr)
1707 {
 /* NOTE(review): the embedded listing numbers skip 1705, 1713 and 1725, so
  * the first line of the signature, the tail of the mean_lsf initializer and
  * one statement of the cache branch below are absent from this copy. */
1708  WMAVoiceContext *s = ctx->priv_data;
1709  GetBitContext *gb = &s->gb, s_gb; /* s_gb is only used when sframe_cache_size > 0 */
1710  int n, res, n_samples = MAX_SFRAMESIZE;
1711  double lsps[MAX_FRAMES][MAX_LSPS];
1712  const double *mean_lsf = s->lsps == 16 ?
 /* NOTE(review): listing line 1713 (both arms of the conditional above) is
  * missing here. */
1714  float excitation[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE + 12];
1715  float synth[MAX_LSPS + MAX_SFRAMESIZE];
1716  float *samples;
1717 
 /* Seed the local work buffers with the state saved at the end of the
  * previous call: s->lsps synthesis samples and s->history_nsamples
  * excitation samples. New output is written right behind these. */
1718  memcpy(synth, s->synth_history,
1719  s->lsps * sizeof(*synth));
1720  memcpy(excitation, s->excitation_history,
1721  s->history_nsamples * sizeof(*excitation));
1722 
 /* If cached superframe data is pending, read from the local reader s_gb
  * instead of s->gb and mark the cache as consumed.
  * NOTE(review): listing line 1725 is missing; presumably it initializes
  * s_gb over the cached data - confirm. */
1723  if (s->sframe_cache_size > 0) {
1724  gb = &s_gb;
1726  s->sframe_cache_size = 0;
1727  }
1728 
1729  /* First bit is speech/music bit, it differentiates between WMAVoice
1730  * speech samples (the actual codec) and WMAVoice music samples, which
1731  * are really WMAPro-in-WMAVoice-superframes. I've never seen those in
1732  * the wild yet. */
1733  if (!get_bits1(gb)) {
1734  avpriv_request_sample(ctx, "WMAPro-in-WMAVoice");
1735  return AVERROR_PATCHWELCOME;
1736  }
1737 
1738  /* (optional) nr. of samples in superframe; always <= 480 and >= 0 */
 /* A flag bit selects whether a 12-bit sample count follows; without it
  * n_samples keeps its default of MAX_SFRAMESIZE. */
1739  if (get_bits1(gb)) {
1740  if ((n_samples = get_bits(gb, 12)) > MAX_SFRAMESIZE) {
1741  av_log(ctx, AV_LOG_ERROR,
1742  "Superframe encodes > %d samples (%d), not allowed\n",
1743  MAX_SFRAMESIZE, n_samples);
1744  return AVERROR_INVALIDDATA;
1745  }
1746  }
1747 
1748  /* Parse LSPs, if global for the superframe (can also be per-frame). */
1749  if (s->has_residual_lsps) {
1750  double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];
1751 
 /* The dequantizer is handed the previous LSPs with the mean removed. */
1752  for (n = 0; n < s->lsps; n++)
1753  prev_lsps[n] = s->prev_lsps[n] - mean_lsf[n];
1754 
1755  if (s->lsps == 10) {
1756  dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1757  } else /* s->lsps == 16 */
1758  dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1759 
 /* The dequantizer filled lsps[2] (third frame) plus the a1/a2 tables;
  * frames 0 and 1 are built from a1/a2, and the mean is added back to
  * all three sets. */
1760  for (n = 0; n < s->lsps; n++) {
1761  lsps[0][n] = mean_lsf[n] + (a1[n] - a2[n * 2]);
1762  lsps[1][n] = mean_lsf[n] + (a1[s->lsps + n] - a2[n * 2 + 1]);
1763  lsps[2][n] += mean_lsf[n];
1764  }
1765  for (n = 0; n < 3; n++)
1766  stabilize_lsps(lsps[n], s->lsps);
1767  }
1768 
1769  /* get output buffer */
 /* The buffer is requested for a full superframe (MAX_SFRAMESIZE samples)
  * and nb_samples is then lowered to the count actually signalled. */
1770  frame->nb_samples = MAX_SFRAMESIZE;
1771  if ((res = ff_get_buffer(ctx, frame, 0)) < 0)
1772  return res;
1773  frame->nb_samples = n_samples;
1774  samples = (float *)frame->data[0];
1775 
1776  /* Parse frames, optionally preceded by per-frame (independent) LSPs. */
1777  for (n = 0; n < 3; n++) {
1778  if (!s->has_residual_lsps) {
1779  int m;
1780 
1781  if (s->lsps == 10) {
1782  dequant_lsp10i(gb, lsps[n]);
1783  } else /* s->lsps == 16 */
1784  dequant_lsp16i(gb, lsps[n]);
1785 
1786  for (m = 0; m < s->lsps; m++)
1787  lsps[n][m] += mean_lsf[m];
1788  stabilize_lsps(lsps[n], s->lsps);
1789  }
1790 
 /* Frame n works on its own MAX_FRAMESIZE-sized slice of the output,
  * excitation and synthesis buffers. The "previous" LSPs are those of
  * the preceding frame, or s->prev_lsps for the first frame. Any
  * non-zero return value aborts the superframe without output. */
1791  if ((res = synth_frame(ctx, gb, n,
1792  &samples[n * MAX_FRAMESIZE],
1793  lsps[n], n == 0 ? s->prev_lsps : lsps[n - 1],
1794  &excitation[s->history_nsamples + n * MAX_FRAMESIZE],
1795  &synth[s->lsps + n * MAX_FRAMESIZE]))) {
1796  *got_frame_ptr = 0;
1797  return res;
1798  }
1799  }
1800 
1801  /* Statistics? FIXME - we don't check for length, a slight overrun
1802  * will be caught by internal buffer padding, and anything else
1803  * will be skipped, not read. */
 /* res is reused as scratch here: a 4-bit value v is read and
  * 10 * (v + 1) bits are skipped. */
1804  if (get_bits1(gb)) {
1805  res = get_bits(gb, 4);
1806  skip_bits(gb, 10 * (res + 1));
1807  }
1808 
 /* A negative remainder means more bits were consumed than the reader
  * holds: reset the decoder state and report invalid data. */
1809  if (get_bits_left(gb) < 0) {
1810  wmavoice_flush(ctx);
1811  return AVERROR_INVALIDDATA;
1812  }
1813 
1814  *got_frame_ptr = 1;
1815 
1816  /* Update history */
 /* Saved for the next call: the third frame's LSPs, and s->lsps synthesis
  * samples / s->history_nsamples excitation samples taken from index
  * MAX_SFRAMESIZE of the local buffers, i.e. the end of the region
  * written above. */
1817  memcpy(s->prev_lsps, lsps[2],
1818  s->lsps * sizeof(*s->prev_lsps));
1819  memcpy(s->synth_history, &synth[MAX_SFRAMESIZE],
1820  s->lsps * sizeof(*synth));
1821  memcpy(s->excitation_history, &excitation[MAX_SFRAMESIZE],
1822  s->history_nsamples * sizeof(*excitation));
 /* With the postfilter enabled, zero_exc_pf is shifted down by one
  * superframe as well (memmove, since source and destination share the
  * same array). */
1823  if (s->do_apf)
1824  memmove(s->zero_exc_pf, &s->zero_exc_pf[MAX_SFRAMESIZE],
1825  s->history_nsamples * sizeof(*s->zero_exc_pf));
1826 
1827  return 0;
1828 }
1829 
1830 /**
1831  * Parse the packet header at the start of each packet (input data to this
1832  * decoder).
1833  *
1834  * @param s WMA Voice decoding context private data
1835  * @return <0 on error, nb_superframes on success.
1836  */
1838 {
 /* NOTE(review): the embedded listing numbers skip 1837 (the signature
  * line) and 1849, so this copy of the function is incomplete. */
1839  GetBitContext *gb = &s->gb;
1840  unsigned int res, n_superframes = 0;
1841 
1842  skip_bits(gb, 4); // packet sequence number
1843  s->has_residual_lsps = get_bits1(gb);
 /* The superframe count is coded in 6-bit chunks that are summed up; a
  * chunk equal to 0x3F means that another chunk follows. */
1844  do {
1845  res = get_bits(gb, 6); // number of superframes per packet
1846  // (minus first one if there is spillover)
1847  n_superframes += res;
1848  } while (res == 0x3F);
 /* NOTE(review): listing line 1849 is missing here; no assignment to
  * s->spillover_nbits is visible in this copy although the caller reads
  * that field after this function returns - confirm against the
  * complete file. */
1850 
 /* Reading past the end of the packet makes the header invalid. */
1851  return get_bits_left(gb) >= 0 ? n_superframes : AVERROR_INVALIDDATA;
1852 }
1853 
1854 /**
1855  * Copy (unaligned) bits from gb/data/size to pb.
1856  *
1857  * @param pb target buffer to copy bits into
1858  * @param data source buffer to copy bits from
1859  * @param size size of the source data, in bytes
1860  * @param gb bit I/O context specifying the current position in the source.
1861  * data. This function might use this to align the bit position to
1862  * a whole-byte boundary before calling #avpriv_copy_bits() on aligned
1863  * source data
1864  * @param nbits the amount of bits to copy from source to target
1865  *
1866  * @note after calling this function, the current position in the input bit
1867  * I/O context is undefined.
1868  */
1869 static void copy_bits(PutBitContext *pb,
1870  const uint8_t *data, int size,
1871  GetBitContext *gb, int nbits)
1872 {
1873  int rmn_bytes, rmn_bits;
1874 
1875  rmn_bits = rmn_bytes = get_bits_left(gb);
1876  if (rmn_bits < nbits)
1877  return;
1878  if (nbits > pb->size_in_bits - put_bits_count(pb))
1879  return;
1880  rmn_bits &= 7; rmn_bytes >>= 3;
1881  if ((rmn_bits = FFMIN(rmn_bits, nbits)) > 0)
1882  put_bits(pb, rmn_bits, get_bits(gb, rmn_bits));
1883  avpriv_copy_bits(pb, data + size - rmn_bytes,
1884  FFMIN(nbits - rmn_bits, rmn_bytes << 3));
1885 }
1886 
1887 /**
1888  * Packet decoding: a packet is anything that the (ASF) demuxer contains,
1889  * and we expect that the demuxer / application provides it to us as such
1890  * (else you'll probably get garbage as output). Every packet has a size of
1891  * ctx->block_align bytes, starts with a packet header (see
1892  * #parse_packet_header()), and then a series of superframes. Superframe
1893  * boundaries may exceed packets, i.e. superframes can split data over
1894  * multiple (two) packets.
1895  *
1896  * For more information about frames, see #synth_superframe().
1897  */
1899  int *got_frame_ptr, AVPacket *avpkt)
1900 {
 /* NOTE(review): the embedded listing numbers skip 1898 (first line of the
  * signature), 1936 and 1970, so this copy of the function is missing
  * those statements. */
1901  WMAVoiceContext *s = ctx->priv_data;
1902  GetBitContext *gb = &s->gb;
1903  int size, res, pos;
1904 
1905  /* Packets are sometimes a multiple of ctx->block_align, with a packet
1906  * header at each ctx->block_align bytes. However, FFmpeg's ASF demuxer
1907  * feeds us ASF packets, which may concatenate multiple "codec" packets
1908  * in a single "muxer" packet, so we artificially emulate that by
1909  * capping the packet size at ctx->block_align. */
 /* The loop body is empty on purpose: block_align is subtracted until
  * size is no larger than block_align.
  * NOTE(review): this assumes ctx->block_align > 0; with 0 the loop would
  * not terminate for a non-empty packet and the modulo below would divide
  * by zero - confirm that init rejects such streams. */
1910  for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align);
1911  init_get_bits(&s->gb, avpkt->data, size << 3);
1912 
1913  /* size == ctx->block_align is used to indicate whether we are dealing with
1914  * a new packet or a packet of which we already read the packet header
1915  * previously. */
 /* After the capping above the remainder is zero only when size is 0
  * (empty input) or exactly block_align. */
1916  if (!(size % ctx->block_align)) { // new packet header
1917  if (!size) {
1918  s->spillover_nbits = 0;
1919  s->nb_superframes = 0;
1920  } else {
1921  if ((res = parse_packet_header(s)) < 0)
1922  return res;
1923  s->nb_superframes = res;
1924  }
1925 
1926  /* If the packet header specifies a s->spillover_nbits, then we want
1927  * to push out all data of the previous packet (+ spillover) before
1928  * continuing to parse new superframes in the current packet. */
1929  if (s->sframe_cache_size > 0) {
1930  int cnt = get_bits_count(gb);
 /* Clamp the spillover so it cannot extend past the end of avpkt. */
1931  if (cnt + s->spillover_nbits > avpkt->size * 8) {
1932  s->spillover_nbits = avpkt->size * 8 - cnt;
1933  }
 /* Append the spillover bits to the data already held in s->pb. */
1934  copy_bits(&s->pb, avpkt->data, size, gb, s->spillover_nbits);
1935  flush_put_bits(&s->pb);
 /* NOTE(review): listing line 1936 is missing here. */
 /* On success, return the number of whole bytes read (header plus
  * spillover) and remember the leftover bit offset in
  * skip_bits_next, which is skipped at the start of a later call
  * (see the else branch further down). */
1937  if ((res = synth_superframe(ctx, data, got_frame_ptr)) == 0 &&
1938  *got_frame_ptr) {
1939  cnt += s->spillover_nbits;
1940  s->skip_bits_next = cnt & 7;
1941  res = cnt >> 3;
1942  return res;
1943  } else
1944  skip_bits_long (gb, s->spillover_nbits - cnt +
1945  get_bits_count(gb)); // resync
1946  } else if (s->spillover_nbits) {
1947  skip_bits_long(gb, s->spillover_nbits); // resync
1948  }
1949  } else if (s->skip_bits_next)
1950  skip_bits(gb, s->skip_bits_next);
1951 
1952  /* Try parsing superframes in current packet */
1953  s->sframe_cache_size = 0;
1954  s->skip_bits_next = 0;
1955  pos = get_bits_left(gb);
 /* nb_superframes is tested before and after the decrement:
  *  - it was 0: nothing left to decode, the packet is consumed without
  *    producing a frame;
  *  - it is still > 0 afterwards: decode one superframe from this packet;
  *  - otherwise (it was exactly 1): the unread bits (pos) are handed to
  *    copy_bits() below for the next packet. */
1956  if (s->nb_superframes-- == 0) {
1957  *got_frame_ptr = 0;
1958  return size;
1959  } else if (s->nb_superframes > 0) {
1960  if ((res = synth_superframe(ctx, data, got_frame_ptr)) < 0) {
1961  return res;
1962  } else if (*got_frame_ptr) {
 /* Same byte/bit split of the read position as in the spillover
  * path above. */
1963  int cnt = get_bits_count(gb);
1964  s->skip_bits_next = cnt & 7;
1965  res = cnt >> 3;
1966  return res;
1967  }
1968  } else if ((s->sframe_cache_size = pos) > 0) {
1969  /* ... cache it for spillover in next packet */
 /* NOTE(review): listing line 1970 is missing here; presumably it
  * (re)initializes s->pb before the copy - confirm. */
1971  copy_bits(&s->pb, avpkt->data, size, gb, s->sframe_cache_size);
1972  // FIXME bad - just copy bytes as whole and add use the
1973  // skip_bits_next field
1974  }
1975 
 /* Reached when no frame was returned above: the (capped) packet size is
  * returned. */
1976  return size;
1977 }
1978 
1980 {
 /* NOTE(review): the embedded listing numbers skip 1979, so the signature
  * line of this function is absent from this copy. */
1981  WMAVoiceContext *s = ctx->priv_data;
1982 
 /* The four transform contexts are released only when the postfilter flag
  * is set; presumably they are only set up in that case - confirm against
  * the init function. */
1983  if (s->do_apf) {
1984  ff_rdft_end(&s->rdft);
1985  ff_rdft_end(&s->irdft);
1986  ff_dct_end(&s->dct);
1987  ff_dct_end(&s->dst);
1988  }
1989 
 /* Always reports success. */
1990  return 0;
1991 }
1992 
1994  .name = "wmavoice",
 /* NOTE(review): the embedded listing numbers skip 1993, 1999, 2002 and
  * 2003, so the declaration line and several initializers of this codec
  * descriptor are absent from this copy. */
1995  .long_name = NULL_IF_CONFIG_SMALL("Windows Media Audio Voice"),
1996  .type = AVMEDIA_TYPE_AUDIO,
1997  .id = AV_CODEC_ID_WMAVOICE,
1998  .priv_data_size = sizeof(WMAVoiceContext), /* size of the private decoder context */
2000  .init_static_data = wmavoice_init_static_data,
2001  .close = wmavoice_decode_end, /* releases the postfilter transform contexts */
2004  .flush = wmavoice_flush, /* also called from synth_superframe() on overread */
2005 };
RDFTContext rdft
Definition: wmavoice.c:270
Description of frame types.
Definition: wmavoice.c:98
static void aw_pulse_set1(WMAVoiceContext *s, GetBitContext *gb, int block_idx, AMRFixed *fcb)
Apply first set of pitch-adaptive window pulses.
Definition: wmavoice.c:1179
av_cold void ff_rdft_end(RDFTContext *s)
Definition: rdft.c:132
static const uint8_t wmavoice_dq_lsp16r2[0x500]
#define NULL
Definition: coverity.c:32
const char const char void * val
Definition: avisynth_c.h:771
int do_apf
whether to apply the averaged projection filter (APF)
Definition: wmavoice.c:150
hardcoded (fixed) codebook with per-block gain values
Definition: wmavoice.c:86
const char * s
Definition: avisynth_c.h:768
#define AVERROR_INVALIDDATA
Invalid data found when processing input.
Definition: error.h:59
static int pRNG(int frame_cntr, int block_num, int block_size)
Generate a random number from frame_cntr and block_idx, which will live in the range [0...
Definition: wmavoice.c:1240
static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
Set up the variable bit mode (VBM) tree from container extradata.
Definition: wmavoice.c:305
void ff_celp_lp_synthesis_filterf(float *out, const float *filter_coeffs, const float *in, int buffer_length, int filter_length)
LP synthesis filter.
Definition: celp_filters.c:84
float gain_pred_err[6]
cache for gain prediction
Definition: wmavoice.c:255
This structure describes decoded (raw) audio or video data.
Definition: frame.h:190
void(* dct_calc)(struct DCTContext *s, FFTSample *data)
Definition: dct.h:37
int aw_next_pulse_off_cache
the position (relative to start of the second block) at which pulses should start to be positioned...
Definition: wmavoice.c:246
int nb_superframes
number of superframes in current packet
Definition: wmavoice.c:254
int frame_lsp_bitsize
size (in bits) of LSPs, when encoded per-frame (independent coding)
Definition: wmavoice.c:163
ptrdiff_t const GLvoid * data
Definition: opengl_enc.c:101
static void flush(AVCodecContext *avctx)
float postfilter_agc
gain control memory, used in adaptive_gain_control()
Definition: wmavoice.c:276
void ff_acelp_apply_order_2_transfer_function(float *out, const float *in, const float zero_coeffs[2], const float pole_coeffs[2], float gain, float mem[2], int n)
Apply an order 2 rational transfer function in-place.
static void put_bits(Jpeg2000EncoderContext *s, int val, int n)
put n times val bit
Definition: j2kenc.c:206
static unsigned int get_bits(GetBitContext *s, int n)
Read 1-25 bits.
Definition: get_bits.h:261
static void postfilter(WMAVoiceContext *s, const float *synth, float *samples, int size, const float *lpcs, float *zero_exc_pf, int fcb_type, int pitch)
Averaging projection filter, the postfilter used in WMAVoice.
Definition: wmavoice.c:807
Memory handling functions.
void ff_weighted_vector_sumf(float *out, const float *in_a, const float *in_b, float weight_coeff_a, float weight_coeff_b, int length)
float implementation of weighted sum of two vectors.
static void skip_bits_long(GetBitContext *s, int n)
Definition: get_bits.h:204
static av_cold int init(AVCodecContext *avctx)
Definition: avrndec.c:35
#define INIT_VLC_STATIC(vlc, bits, a, b, c, d, e, f, g, static_size)
Definition: vlc.h:57
float synth_filter_out_buf[0x80+MAX_LSPS_ALIGN16]
aligned buffer for postfilter speech synthesis
Definition: wmavoice.c:288
static void aw_parse_coords(WMAVoiceContext *s, GetBitContext *gb, const int *pitch)
Parse the offset of the first pitch-adaptive window pulses, and the distribution of pulses between th...
Definition: wmavoice.c:1037
static const int8_t pulses[4]
Number of non-zero pulses in the MP-MLQ excitation.
Definition: g723_1.h:720
int x[10]
Definition: acelp_vectors.h:55
int size
Definition: avcodec.h:1613
int aw_n_pulses[2]
number of AW-pulses in each block; note that this number can be negative (in which case it basically ...
Definition: wmavoice.c:241
static int interpol(MBContext *s, uint32_t *color, int x, int y, int linesize)
void avpriv_copy_bits(PutBitContext *pb, const uint8_t *src, int length)
Copy the content of src to the bitstream.
Definition: bitstream.c:65
static void stabilize_lsps(double *lsps, int num)
Ensure minimum value for first item, maximum value for last value, proper spacing between each value ...
Definition: wmavoice.c:1659
static const float wmavoice_gain_codebook_fcb[128]
static const uint8_t wmavoice_dq_lsp16i1[0x640]
#define a1
Definition: regdef.h:47
static const uint8_t wmavoice_dq_lsp16r1[0x500]
int spillover_nbits
number of bits of the previous packet's last superframe preceding this packet's first full superframe...
Definition: wmavoice.c:193
void ff_set_fixed_vector(float *out, const AMRFixed *in, float scale, int size)
Add fixed vector to an array from a sparse representation.
int block_pitch_nbits
number of bits used to specify the first block's pitch value
Definition: wmavoice.c:172
static const uint8_t wmavoice_dq_lsp16i3[0x300]
float pitch_fac
Definition: acelp_vectors.h:59
static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx, float *samples, const double *lsps, const double *prev_lsps, float *excitation, float *synth)
Synthesize output samples for a single frame.
Definition: wmavoice.c:1483
static void calc_input_response(WMAVoiceContext *s, float *lpcs, int fcb_type, float *coeffs, int remainder)
Derive denoise filter coefficients (in real domain) from the LPCs.
Definition: wmavoice.c:608
static void dequant_lsp10i(GetBitContext *gb, double *lsps)
Parse 10 independently-coded LSPs.
Definition: wmavoice.c:892
int av_log2_16bit(unsigned v)
Definition: intmath.c:31
AVCodec.
Definition: avcodec.h:3620
#define MAX_LSPS_ALIGN16
same as MAX_LSPS; needs to be multiple
Definition: wmavoice.c:48
int block_align
number of bytes per packet if constant and known or 0 Used by some WAV based audio codecs...
Definition: avcodec.h:2486
static int aw_pulse_set2(WMAVoiceContext *s, GetBitContext *gb, int block_idx, AMRFixed *fcb)
Apply second set of pitch-adaptive window pulses.
Definition: wmavoice.c:1089
Pitch-adaptive window (AW) pulse signals, used in particular for low-bitrate streams.
Definition: wmavoice.c:88
static const float wmavoice_ipol1_coeffs[17 *9]
static const uint8_t wmavoice_dq_lsp16i2[0x3c0]
#define AV_CODEC_CAP_DELAY
Encoder or decoder requires flushing with NULL input at the end in order to give the complete and cor...
Definition: avcodec.h:989
#define av_assert0(cond)
assert() equivalent, that is always enabled.
Definition: avassert.h:37
int spillover_bitsize
number of bits used to specify spillover_nbits in the packet header = ceil(log2(ctx->block_align << 3...
Definition: wmavoice.c:143
float avpriv_scalarproduct_float_c(const float *v1, const float *v2, int len)
Return the scalar product of two vectors.
Definition: float_dsp.c:108
void void avpriv_request_sample(void *avc, const char *msg,...) av_printf_format(2
Log a generic warning message about a missing feature.
int block_delta_pitch_nbits
number of bits used to specify the delta pitch between this and the last block's pitch value...
Definition: wmavoice.c:175
uint8_t bits
Definition: crc.c:296
enum AVSampleFormat sample_fmt
audio sample format
Definition: avcodec.h:2457
int mem
Definition: avisynth_c.h:821
uint8_t
#define av_cold
Definition: attributes.h:82
Sparse representation for the algebraic codebook (fixed) vector.
Definition: acelp_vectors.h:53
static const uint8_t wmavoice_dq_lsp16r3[0x600]
float delta
DCTContext dct
Definition: wmavoice.c:272
static const float wmavoice_gain_codebook_acb[128]
uint8_t log_n_blocks
log2(n_blocks)
Definition: wmavoice.c:101
int aw_first_pulse_off[2]
index of first sample to which to apply AW-pulses, or -0xff if unset
Definition: wmavoice.c:244
static av_cold int end(AVCodecContext *avctx)
Definition: avrndec.c:90
int has_residual_lsps
if set, superframes contain one set of LSPs that cover all frames, encoded as independent and residua...
Definition: wmavoice.c:197
float tilted_lpcs_pf[0x80]
aligned buffer for LPC tilting
Definition: wmavoice.c:284
uint8_t * extradata
some codecs need / can use extradata like Huffman tables.
Definition: avcodec.h:1802
static float tilt_factor(const float *lpcs, int n_lpcs)
Get the tilt factor of a formant filter from its transfer function.
Definition: wmavoice.c:595
static const uint8_t wmavoice_dq_lsp10r[0x1400]
static AVFrame * frame
static void dequant_lsps(double *lsps, int num, const uint16_t *values, const uint16_t *sizes, int n_stages, const uint8_t *table, const double *mul_q, const double *base_q)
Dequantize LSPs.
Definition: wmavoice.c:861
#define DECLARE_ALIGNED(n, t, v)
Declare a variable that is aligned in memory.
Definition: mem.h:101
static const float wmavoice_ipol2_coeffs[32]
Hamming-window sinc function (num = 32, x = [ 0, 31 ]): (0.54 + 0.46 * cos(2 * M_PI * x / (num - 1)))...
uint8_t * data
Definition: avcodec.h:1612
static int get_bits_count(const GetBitContext *s)
Definition: get_bits.h:199
static int flags
Definition: log.c:57
float dcf_mem[2]
DC filter history.
Definition: wmavoice.c:278
void av_memcpy_backptr(uint8_t *dst, int back, int cnt)
Overlapping memcpy() implementation.
Definition: mem.c:430
bitstream reader API header.
static av_cold void wmavoice_flush(AVCodecContext *ctx)
Definition: wmavoice.c:341
float synth_history[MAX_LSPS]
see excitation_history
Definition: wmavoice.c:260
ptrdiff_t size
Definition: opengl_enc.c:101
double prev_lsps[MAX_LSPS]
LSPs of the last frame of the previous superframe.
Definition: wmavoice.c:225
static void copy_bits(PutBitContext *pb, const uint8_t *data, int size, GetBitContext *gb, int nbits)
Copy (unaligned) bits from gb/data/size to pb.
Definition: wmavoice.c:1869
#define av_log(a,...)
#define expf(x)
Definition: libm.h:283
#define U(x)
Definition: vp56_arith.h:37
static int get_bits_left(GetBitContext *gb)
Definition: get_bits.h:587
int size_in_bits
Definition: put_bits.h:39
static double alpha(void *priv, double x, double y)
Definition: vf_geq.c:99
#define AV_LOG_ERROR
Something went wrong and cannot losslessly be recovered.
Definition: log.h:176
static const double wmavoice_mean_lsf16[2][16]
int sframe_cache_size
set to >0 if we have data from an (incomplete) superframe from a previous packet that spilled over in...
Definition: wmavoice.c:209
static const float wmavoice_lsp10_intercoeff_b[32][2][10]
no adaptive codebook (only hardcoded fixed)
Definition: wmavoice.c:68
int block_pitch_range
range of the block pitch
Definition: wmavoice.c:174
static const float wmavoice_std_codebook[1000]
static const int sizes[][2]
Definition: img2dec.c:50
int last_acb_type
frame type [0-2] of the previous frame
Definition: wmavoice.c:228
#define AVERROR(e)
Definition: error.h:43
static const struct endianess table[]
#define NULL_IF_CONFIG_SMALL(x)
Return NULL if CONFIG_SMALL is true, otherwise the argument without modification. ...
Definition: internal.h:176
static const float wmavoice_gain_silence[256]
int denoise_filter_cache_size
samples in denoise_filter_cache
Definition: wmavoice.c:283
int history_nsamples
number of samples in history for signal prediction (through ACB)
Definition: wmavoice.c:146
static const uint8_t wmavoice_dq_lsp10i[0xf00]
Definition: wmavoice_data.h:33
static const float wmavoice_lsp10_intercoeff_a[32][2][10]
#define t1
Definition: regdef.h:29
static const float wmavoice_energy_table[128]
LUT for 1.071575641632 * pow(1.0331663, n - 127)
Windows Media Voice (WMAVoice) tables.
Definition: avfft.h:73
const char * name
Name of the codec implementation.
Definition: avcodec.h:3627
int no_repeat_mask
Definition: acelp_vectors.h:57
int denoise_tilt_corr
Whether to apply tilt correction to the Wiener filter coefficients (postfilter)
Definition: wmavoice.c:154
int aw_idx_is_ext
whether the AW index was encoded in 8 bits (instead of 6)
Definition: wmavoice.c:233
#define t3
Definition: regdef.h:31
static const uint8_t offset[127][2]
Definition: vf_spp.c:92
#define FFMAX(a, b)
Definition: common.h:94
uint16_t block_conv_table[4]
boundaries for block pitch unit/scale conversion
Definition: wmavoice.c:181
#define MUL16(ra, rb)
Definition: mathops.h:88
DCTContext dst
contexts for phase shift (in Hilbert transform, part of postfilter)
Definition: wmavoice.c:272
int lsp_def_mode
defines different sets of LSP defaults [0, 1]
Definition: wmavoice.c:161
Definition: vlc.h:26
uint64_t channel_layout
Audio channel layout.
Definition: avcodec.h:2500
void(* rdft_calc)(struct RDFTContext *s, FFTSample *z)
Definition: rdft.h:60
static int put_bits_count(PutBitContext *s)
Definition: put_bits.h:85
#define powf(x, y)
Definition: libm.h:50
int skip_bits_next
number of bits to skip at the next call to wmavoice_decode_packet() (since they're part of the previo...
Definition: wmavoice.c:202
static void dequant_lsp16r(GetBitContext *gb, double *i_lsps, const double *old, double *a1, double *a2, int q_mode)
Parse 16 independently-coded LSPs, and then derive the tables to generate LSPs for the other frames f...
Definition: wmavoice.c:987
int min_pitch_val
base value for pitch parsing code
Definition: wmavoice.c:168
WMA Voice decoding context.
Definition: wmavoice.c:132
static void wiener_denoise(WMAVoiceContext *s, int fcb_type, float *synth_pf, int size, const float *lpcs)
This function applies a Wiener filter on the (noisy) speech signal as a means to denoise it...
Definition: wmavoice.c:725
int denoise_strength
strength of denoising in Wiener filter [0-11]
Definition: wmavoice.c:152
uint8_t sframe_cache[SFRAME_CACHE_MAXSIZE+AV_INPUT_BUFFER_PADDING_SIZE]
cache for superframe data split over multiple packets
Definition: wmavoice.c:206
comfort noise during silence generated from a hardcoded (fixed) codebook with per-frame (low) gain va...
Definition: wmavoice.c:83
audio channel layout utility functions
Definition: avfft.h:97
#define FFMIN(a, b)
Definition: common.h:96
#define log_range(var, assign)
#define MAX_LSPS
maximum filter order
Definition: wmavoice.c:47
static VLC frame_type_vlc
Frame type VLC coding.
Definition: wmavoice.c:62
int pitch_nbits
number of bits used to specify the pitch value in the frame header
Definition: wmavoice.c:170
#define MAX_BLOCKS
maximum number of blocks per frame
Definition: wmavoice.c:46
float denoise_coeffs_pf[0x80]
aligned buffer for denoise coefficients
Definition: wmavoice.c:286
static void dequant_lsp10r(GetBitContext *gb, double *i_lsps, const double *old, double *a1, double *a2, int q_mode)
Parse 10 independently-coded LSPs, and then derive the tables to generate LSPs for the other frames f...
Definition: wmavoice.c:918
float y[10]
Definition: acelp_vectors.h:56
AVFormatContext * ctx
Definition: movenc.c:48
static av_always_inline unsigned UMULH(unsigned a, unsigned b)
Definition: mathops.h:68
#define a2
Definition: regdef.h:48
Definition: dct.h:31
float sin[511]
Definition: wmavoice.c:274
static av_always_inline int get_vlc2(GetBitContext *s, VLC_TYPE(*table)[2], int bits, int max_depth)
Parse a vlc code.
Definition: get_bits.h:554
Definition: avfft.h:72
int n
Definition: avisynth_c.h:684
static int kalman_smoothen(WMAVoiceContext *s, int pitch, const float *in, float *out, int size)
Kalman smoothing function.
Definition: wmavoice.c:549
void ff_tilt_compensation(float *mem, float tilt, float *samples, int size)
Apply tilt compensation filter, 1 - tilt * z-1.
static const float wmavoice_gain_universal[64]
void ff_acelp_lspd2lpc(const double *lsp, float *lpc, int lp_half_order)
Reconstruct LPC coefficients from the line spectral pair frequencies.
Definition: lsp.c:209
static av_cold int wmavoice_decode_init(AVCodecContext *ctx)
Set up decoder with parameters from demuxer (extradata etc.).
Definition: wmavoice.c:372
#define AVERROR_PATCHWELCOME
Not yet implemented in FFmpeg, patches welcome.
Definition: error.h:62
int sframe_lsp_bitsize
size (in bits) of LSPs, when encoded per superframe (residual coding)
Definition: wmavoice.c:165
Per-block pitch with signal generation using a Hamming sinc window function.
Definition: wmavoice.c:74
static const uint8_t last_coeff[3]
Definition: qdm2data.h:257
static const struct frame_type_desc frame_descs[17]
float denoise_filter_cache[MAX_FRAMESIZE]
Definition: wmavoice.c:282
Libavcodec external API header.
int sample_rate
samples per second
Definition: avcodec.h:2449
void AAC_RENAME() ff_sine_window_init(INTFLOAT *window, int n)
Generate a sine window.
static int wmavoice_decode_packet(AVCodecContext *ctx, void *data, int *got_frame_ptr, AVPacket *avpkt)
Packet decoding: a packet is anything that the (ASF) demuxer contains, and we expect that the demuxer...
Definition: wmavoice.c:1898
main external API structure.
Definition: avcodec.h:1687
static int parse_packet_header(WMAVoiceContext *s)
Parse the packet header at the start of each packet (input data to this decoder). ...
Definition: wmavoice.c:1837
Innovation (fixed) codebook pulse sets in combinations of either single pulses or pulse pairs...
Definition: wmavoice.c:90
int ff_get_buffer(AVCodecContext *avctx, AVFrame *frame, int flags)
Get a buffer for a frame.
Definition: utils.c:948
uint8_t pi<< 24) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_U8, uint8_t,(*(constuint8_t *) pi-0x80)*(1.0f/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_U8, uint8_t,(*(constuint8_t *) pi-0x80)*(1.0/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S16, int16_t,(*(constint16_t *) pi >>8)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S16, int16_t,*(constint16_t *) pi *(1.0f/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S16, int16_t,*(constint16_t *) pi *(1.0/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S32, int32_t,(*(constint32_t *) pi >>24)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S32, int32_t,*(constint32_t *) pi *(1.0f/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S32, int32_t,*(constint32_t *) pi *(1.0/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_FLT, float, av_clip_uint8(lrintf(*(constfloat *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_FLT, float, av_clip_int16(lrintf(*(constfloat *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_FLT, float, av_clipl_int32(llrintf(*(constfloat *) pi *(1U<< 31)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_DBL, double, av_clip_uint8(lrint(*(constdouble *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_DBL, double, av_clip_int16(lrint(*(constdouble *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_DBL, double, av_clipl_int32(llrint(*(constdouble *) pi *(1U<< 31))))#defineSET_CONV_FUNC_GROUP(ofmt, ifmt) staticvoidset_generic_function(AudioConvert *ac){}voidff_audio_convert_free(AudioConvert **ac){if(!*ac) return;ff_dither_free(&(*ac) ->dc);av_freep(ac);}AudioConvert *ff_audio_convert_alloc(AVAudioResampleContext *avr, enumAVSampleFormatout_fmt, enumAVSampleFormatin_fmt, intchannels, intsample_rate, 
intapply_map){AudioConvert *ac;intin_planar, out_planar;ac=av_mallocz(sizeof(*ac));if(!ac) returnNULL;ac->avr=avr;ac->out_fmt=out_fmt;ac->in_fmt=in_fmt;ac->channels=channels;ac->apply_map=apply_map;if(avr->dither_method!=AV_RESAMPLE_DITHER_NONE &&av_get_packed_sample_fmt(out_fmt)==AV_SAMPLE_FMT_S16 &&av_get_bytes_per_sample(in_fmt)>2){ac->dc=ff_dither_alloc(avr, out_fmt, in_fmt, channels, sample_rate, apply_map);if(!ac->dc){av_free(ac);returnNULL;}returnac;}in_planar=ff_sample_fmt_is_planar(in_fmt, channels);out_planar=ff_sample_fmt_is_planar(out_fmt, channels);if(in_planar==out_planar){ac->func_type=CONV_FUNC_TYPE_FLAT;ac->planes=in_planar?ac->channels:1;}elseif(in_planar) ac->func_type=CONV_FUNC_TYPE_INTERLEAVE;elseac->func_type=CONV_FUNC_TYPE_DEINTERLEAVE;set_generic_function(ac);if(ARCH_AARCH64) ff_audio_convert_init_aarch64(ac);if(ARCH_ARM) ff_audio_convert_init_arm(ac);if(ARCH_X86) ff_audio_convert_init_x86(ac);returnac;}intff_audio_convert(AudioConvert *ac, AudioData *out, AudioData *in){intuse_generic=1;intlen=in->nb_samples;intp;if(ac->dc){av_log(ac->avr, AV_LOG_TRACE,"%dsamples-audio_convert:%sto%s(dithered)\n", len, av_get_sample_fmt_name(ac->in_fmt), av_get_sample_fmt_name(ac->out_fmt));returnff_convert_dither(ac-> in
AVCodec ff_wmavoice_decoder
Definition: wmavoice.c:1993
int8_t vbm_tree[25]
converts VLC codes to frame type
Definition: wmavoice.c:141
int extradata_size
Definition: avcodec.h:1803
static unsigned int get_bits1(GetBitContext *s)
Definition: get_bits.h:313
static void synth_block(WMAVoiceContext *s, GetBitContext *gb, int block_idx, int size, int block_pitch_sh2, const double *lsps, const double *prev_lsps, const struct frame_type_desc *frame_desc, float *excitation, float *synth)
Parse data in a single block.
Definition: wmavoice.c:1441
static av_cold int wmavoice_decode_end(AVCodecContext *ctx)
Definition: wmavoice.c:1979
static void skip_bits(GetBitContext *s, int n)
Definition: get_bits.h:306
av_cold int ff_dct_init(DCTContext *s, int nbits, enum DCTTransformType inverse)
Set up DCT.
Definition: dct.c:177
#define AV_CODEC_CAP_SUBFRAMES
Codec can output multiple frames per AVPacket Normally demuxers return one frame at a time...
Definition: avcodec.h:1014
int pitch_diff_sh16
((cur_pitch_val - last_pitch_val) << 16) / MAX_FRAMESIZE
Definition: wmavoice.c:229
static int init_get_bits(GetBitContext *s, const uint8_t *buffer, int bit_size)
Initialize GetBitContext.
Definition: get_bits.h:425
#define MAX_SFRAMESIZE
maximum number of samples per superframe
Definition: wmavoice.c:53
int lsp_q_mode
defines quantizer defaults [0, 1]
Definition: wmavoice.c:160
int frame_cntr
current frame index [0 - 0xFFFE]; is only used for comfort noise in pRNG()
Definition: wmavoice.c:252
void ff_celp_lp_zero_synthesis_filterf(float *out, const float *filter_coeffs, const float *in, int buffer_length, int filter_length)
LP zero synthesis filter.
Definition: celp_filters.c:199
#define u(width,...)
static void adaptive_gain_control(float *out, const float *in, const float *speech_synth, int size, float alpha, float *gain_mem)
Adaptive gain control (as used in postfilter).
Definition: wmavoice.c:508
static const float mean_lsf[10]
Definition: siprdata.h:27
#define SFRAME_CACHE_MAXSIZE
maximum cache size for frame data that
Definition: wmavoice.c:55
uint8_t fcb_type
Fixed codebook type (FCB_TYPE_*)
Definition: wmavoice.c:103
static void dequant_lsp16i(GetBitContext *gb, double *lsps)
Parse 16 independently-coded LSPs.
Definition: wmavoice.c:954
RDFTContext irdft
contexts for FFT-calculation in the postfilter (for denoise filter)
Definition: wmavoice.c:270
uint8_t * data[AV_NUM_DATA_POINTERS]
pointer to the picture/channel planes.
Definition: frame.h:204
static int synth_superframe(AVCodecContext *ctx, AVFrame *frame, int *got_frame_ptr)
Synthesize output samples for a single superframe.
Definition: wmavoice.c:1705
#define M_LN10
Definition: mathematics.h:43
static void synth_block_hardcoded(WMAVoiceContext *s, GetBitContext *gb, int block_idx, int size, const struct frame_type_desc *frame_desc, float *excitation)
Parse hardcoded signal for a single block.
Definition: wmavoice.c:1276
uint8_t n_blocks
amount of blocks per frame (each block (contains 160/n_blocks samples)
Definition: wmavoice.c:99
common internal api header.
static void flush_put_bits(PutBitContext *s)
Pad the end of the output stream with zeros.
Definition: put_bits.h:101
if(ret< 0)
Definition: vf_mcdeint.c:282
static av_cold void wmavoice_init_static_data(AVCodec *codec)
Definition: wmavoice.c:319
int pitch_lag
Definition: acelp_vectors.h:58
float excitation_history[MAX_SIGNAL_HISTORY]
cache of the signal of previous superframes, used as a history for signal generation ...
Definition: wmavoice.c:256
static void init_put_bits(PutBitContext *s, uint8_t *buffer, int buffer_size)
Initialize the PutBitContext s.
Definition: put_bits.h:48
int last_pitch_val
pitch value of the previous frame
Definition: wmavoice.c:227
#define AV_INPUT_BUFFER_PADDING_SIZE
Required number of additionally allocated bytes at the end of the input bitstream for decoding.
Definition: avcodec.h:739
void * priv_data
Definition: avcodec.h:1729
#define MAX_FRAMESIZE
maximum number of samples per frame
Definition: wmavoice.c:51
float silence_gain
set for use in blocks if ACB_TYPE_NONE
Definition: wmavoice.c:231
adaptive codebook with per-frame pitch, which we interpolate to get a per-sample pitch.
Definition: wmavoice.c:69
static const double wmavoice_mean_lsf10[2][10]
static const int16_t coeffs[]
int len
int channels
number of audio channels
Definition: avcodec.h:2450
VLC_TYPE(* table)[2]
code, bits
Definition: vlc.h:28
#define lrint
Definition: tablegen.h:53
av_cold void ff_dct_end(DCTContext *s)
Definition: dct.c:220
void ff_acelp_interpolatef(float *out, const float *in, const float *filter_coeffs, int precision, int frac_pos, int filter_length, int length)
Floating point version of ff_acelp_interpolate()
Definition: acelp_filters.c:78
int block_delta_pitch_hrange
1/2 range of the delta (full range is from -this to +this-1)
Definition: wmavoice.c:179
int max_pitch_val
max value + 1 for pitch parsing
Definition: wmavoice.c:169
#define av_uninit(x)
Definition: attributes.h:149
int lsps
number of LSPs per frame [10 or 16]
Definition: wmavoice.c:159
FILE * out
Definition: movenc.c:54
#define MAX_FRAMES
maximum number of frames per superframe
Definition: wmavoice.c:50
static const float wmavoice_lsp16_intercoeff_b[32][2][16]
static int decode(AVCodecContext *avctx, AVFrame *frame, int *got_frame, AVPacket *pkt)
Definition: ffmpeg.c:2040
PutBitContext pb
bitstream writer for sframe_cache
Definition: wmavoice.c:214
#define M_PI
Definition: mathematics.h:52
uint8_t acb_type
Adaptive codebook type (ACB_TYPE_*)
Definition: wmavoice.c:102
static const float wmavoice_denoise_power_table[12][64]
LUT for f(x,y) = pow((y + 6.9) / 64, 0.025 * (x + 1)).
int dc_level
Predicted amount of DC noise, based on which a DC removal filter is used.
Definition: wmavoice.c:156
#define VLC_NBITS
number of bits to read per VLC iteration
Definition: wmavoice.c:57
static const float wmavoice_lsp16_intercoeff_a[32][2][16]
Definition: avfft.h:96
float cos[511]
8-bit cosine/sine windows over [-pi,pi] range
Definition: wmavoice.c:274
#define AV_CH_LAYOUT_MONO
av_cold int ff_rdft_init(RDFTContext *s, int nbits, enum RDFTransformType trans)
Set up a real FFT.
Definition: rdft.c:99
int aw_pulse_range
the range over which aw_pulse_set1() can apply the pulse, relative to the value in aw_first_pulse_off...
Definition: wmavoice.c:235
float min
uint64_t_TMPL AV_WL64 unsigned int_TMPL AV_RL32
Definition: bytestream.h:87
This structure stores compressed data.
Definition: avcodec.h:1589
int nb_samples
number of audio samples (per channel) described by this frame
Definition: frame.h:247
float zero_exc_pf[MAX_SIGNAL_HISTORY+MAX_SFRAMESIZE]
zero filter output (i.e. excitation) by postfilter
Definition: wmavoice.c:279
#define AV_CODEC_CAP_DR1
Codec uses get_buffer() for allocating buffers and supports custom allocators.
Definition: avcodec.h:964
for(j=16;j >0;--j)
static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb, int block_idx, int size, int block_pitch_sh2, const struct frame_type_desc *frame_desc, float *excitation)
Parse FCB/ACB signal for a single block.
Definition: wmavoice.c:1307
uint8_t dbl_pulses
how many pulse vectors have pulse pairs (rather than just one single pulse); only used if fcb_type == FCB_TYPE_EXC_PULSES
Definition: wmavoice.c:104
#define t2
Definition: regdef.h:30
#define MAX_SIGNAL_HISTORY
maximum excitation signal history
Definition: wmavoice.c:52
uint16_t frame_size
the amount of bits that make up the block data (per frame)
Definition: wmavoice.c:107
#define MULH
Definition: mathops.h:42
GetBitContext gb
packet bitreader.
Definition: wmavoice.c:137
static uint8_t tmp[11]
Definition: aes_ctr.c:26
bitstream writer API