FFmpeg
wmavoice.c
Go to the documentation of this file.
1 /*
2  * Windows Media Audio Voice decoder.
3  * Copyright (c) 2009 Ronald S. Bultje
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 /**
23  * @file
24  * @brief Windows Media Audio Voice compatible decoder
25  * @author Ronald S. Bultje <rsbultje@gmail.com>
26  */
27 
28 #include <math.h>
29 
31 #include "libavutil/float_dsp.h"
32 #include "libavutil/mem.h"
33 #include "libavutil/thread.h"
34 #include "avcodec.h"
35 #include "internal.h"
36 #include "get_bits.h"
37 #include "put_bits.h"
38 #include "wmavoice_data.h"
39 #include "celp_filters.h"
40 #include "acelp_vectors.h"
41 #include "acelp_filters.h"
42 #include "lsp.h"
43 #include "dct.h"
44 #include "rdft.h"
45 #include "sinewin.h"
46 
/* Compile-time size limits used for the fixed-size buffers of the decoder. */
#define MAX_BLOCKS           8   ///< maximum number of blocks per frame
#define MAX_LSPS             16  ///< maximum filter order (largest LSP count)
#define MAX_LSPS_ALIGN16     16  ///< same value as #MAX_LSPS; must stay a
                                 ///< multiple of 16 so that buffers offset by
                                 ///< it keep their alignment for ASM code
#define MAX_FRAMES           3   ///< maximum number of frames per superframe
#define MAX_FRAMESIZE        160 ///< maximum number of samples per frame
#define MAX_SIGNAL_HISTORY   416 ///< maximum excitation signal history
#define MAX_SFRAMESIZE       (MAX_FRAMESIZE * MAX_FRAMES)
                                 ///< maximum number of samples per superframe
#define SFRAME_CACHE_MAXSIZE 256 ///< maximum cache size for frame data that
                                 ///< was split over two packets
#define VLC_NBITS            6   ///< number of bits to read per VLC iteration
59 
60 /**
61  * Frame type VLC coding.
62  */
64 
/**
 * Adaptive codebook (ACB) types.
 *
 * Selects how (and whether) the pitch-based adaptive codebook contributes
 * to the excitation signal of a frame.
 */
enum {
    /** No adaptive codebook; only the hardcoded fixed codebook is used. */
    ACB_TYPE_NONE       = 0,

    /**
     * Adaptive codebook with a per-frame pitch that is interpolated to a
     * per-sample pitch. The signal is generated with an asymmetric sinc
     * window function.
     * @note see #wmavoice_ipol1_coeffs
     */
    ACB_TYPE_ASYMMETRIC = 1,

    /**
     * Adaptive codebook with a per-block pitch; the signal is generated
     * with a Hamming sinc window function.
     * @note see #wmavoice_ipol2_coeffs
     */
    ACB_TYPE_HAMMING    = 2
};
79 
/**
 * Fixed codebook (FCB) types.
 *
 * Selects the source of the innovation (non-pitch) part of the excitation
 * signal of a frame.
 */
enum {
    /**
     * Comfort noise during silence, generated from a hardcoded (fixed)
     * codebook with per-frame (low) gain values.
     */
    FCB_TYPE_SILENCE    = 0,

    /** Hardcoded (fixed) codebook with per-block gain values. */
    FCB_TYPE_HARDCODED  = 1,

    /**
     * Pitch-adaptive window (AW) pulse signals, used in particular for
     * low-bitrate streams.
     */
    FCB_TYPE_AW_PULSES  = 2,

    /**
     * Innovation (fixed) codebook pulse sets, built from combinations of
     * single pulses or pulse pairs.
     */
    FCB_TYPE_EXC_PULSES = 3,
};
95 
/**
 * Description of frame types.
 *
 * One entry per frame type; the table is declared with 17 elements.
 *
 * NOTE(review): only the first two rows carry an initializer here. By the
 * C initialization rules the remaining 15 elements are zero-filled, i.e.
 * they would read as n_blocks == 0 / ACB_TYPE_NONE / FCB_TYPE_SILENCE.
 * That looks like rows were lost rather than intended - confirm against
 * the upstream table before relying on this array.
 */
static const struct frame_type_desc {
    uint8_t n_blocks;     ///< amount of blocks per frame (each block
                          ///< contains 160/#n_blocks samples)
    uint8_t log_n_blocks; ///< log2(#n_blocks)
    uint8_t acb_type;     ///< Adaptive codebook type (ACB_TYPE_*)
    uint8_t fcb_type;     ///< Fixed codebook type (FCB_TYPE_*)
    uint8_t dbl_pulses;   ///< how many pulse vectors have pulse pairs
                          ///< (rather than just one single pulse);
                          ///< only meaningful if #fcb_type is
                          ///< #FCB_TYPE_EXC_PULSES
} frame_descs[17] = {
    { 1, 0, ACB_TYPE_NONE,       FCB_TYPE_SILENCE,    0 },
    { 2, 1, ACB_TYPE_NONE,       FCB_TYPE_HARDCODED,  0 },
};
127 
128 /**
129  * WMA Voice decoding context.
130  */
131 typedef struct WMAVoiceContext {
132  /**
133  * @name Global values specified in the stream header / extradata or used all over.
134  * @{
135  */
136  GetBitContext gb; ///< packet bitreader. During decoder init,
137  ///< it contains the extradata from the
138  ///< demuxer. During decoding, it contains
139  ///< packet data.
140  int8_t vbm_tree[25]; ///< converts VLC codes to frame type
141 
142  int spillover_bitsize; ///< number of bits used to specify
143  ///< #spillover_nbits in the packet header
144  ///< = ceil(log2(ctx->block_align << 3))
145  int history_nsamples; ///< number of samples in history for signal
146  ///< prediction (through ACB)
147 
148  /* postfilter specific values */
149  int do_apf; ///< whether to apply the averaged
150  ///< projection filter (APF)
151  int denoise_strength; ///< strength of denoising in Wiener filter
152  ///< [0-11]
153  int denoise_tilt_corr; ///< Whether to apply tilt correction to the
154  ///< Wiener filter coefficients (postfilter)
155  int dc_level; ///< Predicted amount of DC noise, based
156  ///< on which a DC removal filter is used
157 
158  int lsps; ///< number of LSPs per frame [10 or 16]
159  int lsp_q_mode; ///< defines quantizer defaults [0, 1]
160  int lsp_def_mode; ///< defines different sets of LSP defaults
161  ///< [0, 1]
162 
163  int min_pitch_val; ///< base value for pitch parsing code
164  int max_pitch_val; ///< max value + 1 for pitch parsing
165  int pitch_nbits; ///< number of bits used to specify the
166  ///< pitch value in the frame header
167  int block_pitch_nbits; ///< number of bits used to specify the
168  ///< first block's pitch value
169  int block_pitch_range; ///< range of the block pitch
170  int block_delta_pitch_nbits; ///< number of bits used to specify the
171  ///< delta pitch between this and the last
172  ///< block's pitch value, used in all but
173  ///< first block
174  int block_delta_pitch_hrange; ///< 1/2 range of the delta (full range is
175  ///< from -this to +this-1)
176  uint16_t block_conv_table[4]; ///< boundaries for block pitch unit/scale
177  ///< conversion
178 
179  /**
180  * @}
181  *
182  * @name Packet values specified in the packet header or related to a packet.
183  *
184  * A packet is considered to be a single unit of data provided to this
185  * decoder by the demuxer.
186  * @{
187  */
188  int spillover_nbits; ///< number of bits of the previous packet's
189  ///< last superframe preceding this
190  ///< packet's first full superframe (useful
191  ///< for re-synchronization also)
192  int has_residual_lsps; ///< if set, superframes contain one set of
193  ///< LSPs that cover all frames, encoded as
194  ///< independent and residual LSPs; if not
195  ///< set, each frame contains its own, fully
196  ///< independent, LSPs
197  int skip_bits_next; ///< number of bits to skip at the next call
198  ///< to #wmavoice_decode_packet() (since
199  ///< they're part of the previous superframe)
200 
202  ///< cache for superframe data split over
203  ///< multiple packets
204  int sframe_cache_size; ///< set to >0 if we have data from an
205  ///< (incomplete) superframe from a previous
206  ///< packet that spilled over in the current
207  ///< packet; specifies the amount of bits in
208  ///< #sframe_cache
209  PutBitContext pb; ///< bitstream writer for #sframe_cache
210 
211  /**
212  * @}
213  *
214  * @name Frame and superframe values
215  * Superframe and frame data - these can change from frame to frame,
216  * although some of them do in that case serve as a cache / history for
217  * the next frame or superframe.
218  * @{
219  */
220  double prev_lsps[MAX_LSPS]; ///< LSPs of the last frame of the previous
221  ///< superframe
222  int last_pitch_val; ///< pitch value of the previous frame
223  int last_acb_type; ///< frame type [0-2] of the previous frame
224  int pitch_diff_sh16; ///< ((cur_pitch_val - #last_pitch_val)
225  ///< << 16) / #MAX_FRAMESIZE
226  float silence_gain; ///< set for use in blocks if #ACB_TYPE_NONE
227 
228  int aw_idx_is_ext; ///< whether the AW index was encoded in
229  ///< 8 bits (instead of 6)
230  int aw_pulse_range; ///< the range over which #aw_pulse_set1()
231  ///< can apply the pulse, relative to the
232  ///< value in aw_first_pulse_off. The exact
233  ///< position of the first AW-pulse is within
234  ///< [pulse_off, pulse_off + this], and
235  ///< depends on bitstream values; [16 or 24]
236  int aw_n_pulses[2]; ///< number of AW-pulses in each block; note
237  ///< that this number can be negative (in
238  ///< which case it basically means "zero")
239  int aw_first_pulse_off[2]; ///< index of first sample to which to
240  ///< apply AW-pulses, or -0xff if unset
241  int aw_next_pulse_off_cache; ///< the position (relative to start of the
242  ///< second block) at which pulses should
243  ///< start to be positioned, serves as a
244  ///< cache for pitch-adaptive window pulses
245  ///< between blocks
246 
247  int frame_cntr; ///< current frame index [0 - 0xFFFE]; is
248  ///< only used for comfort noise in #pRNG()
249  int nb_superframes; ///< number of superframes in current packet
250  float gain_pred_err[6]; ///< cache for gain prediction
251  float excitation_history[MAX_SIGNAL_HISTORY];
252  ///< cache of the signal of previous
253  ///< superframes, used as a history for
254  ///< signal generation
255  float synth_history[MAX_LSPS]; ///< see #excitation_history
256  /**
257  * @}
258  *
259  * @name Postfilter values
260  *
261  * Variables used for postfilter implementation, mostly history for
262  * smoothing and so on, and context variables for FFT/iFFT.
263  * @{
264  */
265  RDFTContext rdft, irdft; ///< contexts for FFT-calculation in the
266  ///< postfilter (for denoise filter)
267  DCTContext dct, dst; ///< contexts for phase shift (in Hilbert
268  ///< transform, part of postfilter)
269  float sin[511], cos[511]; ///< 8-bit cosine/sine windows over [-pi,pi]
270  ///< range
271  float postfilter_agc; ///< gain control memory, used in
272  ///< #adaptive_gain_control()
273  float dcf_mem[2]; ///< DC filter history
274  float zero_exc_pf[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE];
275  ///< zero filter output (i.e. excitation)
276  ///< by postfilter
277  float denoise_filter_cache[MAX_FRAMESIZE];
278  int denoise_filter_cache_size; ///< samples in #denoise_filter_cache
279  DECLARE_ALIGNED(32, float, tilted_lpcs_pf)[0x80];
280  ///< aligned buffer for LPC tilting
281  DECLARE_ALIGNED(32, float, denoise_coeffs_pf)[0x80];
282  ///< aligned buffer for denoise coefficients
283  DECLARE_ALIGNED(32, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
284  ///< aligned buffer for postfilter speech
285  ///< synthesis
286  /**
287  * @}
288  */
290 
291 /**
292  * Set up the variable bit mode (VBM) tree from container extradata.
293  * @param gb bit I/O context.
294  * The bit context (s->gb) should be loaded with byte 23-46 of the
295  * container extradata (i.e. the ones containing the VBM tree).
296  * @param vbm_tree pointer to array to which the decoded VBM tree will be
297  * written.
298  * @return 0 on success, <0 on error.
299  */
300 static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
301 {
302  int cntr[8] = { 0 }, n, res;
303 
304  memset(vbm_tree, 0xff, sizeof(vbm_tree[0]) * 25);
305  for (n = 0; n < 17; n++) {
306  res = get_bits(gb, 3);
307  if (cntr[res] > 3) // should be >= 3 + (res == 7))
308  return -1;
309  vbm_tree[res * 3 + cntr[res]++] = n;
310  }
311  return 0;
312 }
313 
315 {
316  static const uint8_t bits[] = {
317  2, 2, 2, 4, 4, 4,
318  6, 6, 6, 8, 8, 8,
319  10, 10, 10, 12, 12, 12,
320  14, 14, 14, 14
321  };
322  static const uint16_t codes[] = {
323  0x0000, 0x0001, 0x0002, // 00/01/10
324  0x000c, 0x000d, 0x000e, // 11+00/01/10
325  0x003c, 0x003d, 0x003e, // 1111+00/01/10
326  0x00fc, 0x00fd, 0x00fe, // 111111+00/01/10
327  0x03fc, 0x03fd, 0x03fe, // 11111111+00/01/10
328  0x0ffc, 0x0ffd, 0x0ffe, // 1111111111+00/01/10
329  0x3ffc, 0x3ffd, 0x3ffe, 0x3fff // 111111111111+xx
330  };
331 
332  INIT_VLC_STATIC(&frame_type_vlc, VLC_NBITS, sizeof(bits),
333  bits, 1, 1, codes, 2, 2, 132);
334 }
335 
337 {
338  WMAVoiceContext *s = ctx->priv_data;
339  int n;
340 
341  s->postfilter_agc = 0;
342  s->sframe_cache_size = 0;
343  s->skip_bits_next = 0;
344  for (n = 0; n < s->lsps; n++)
345  s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
346  memset(s->excitation_history, 0,
347  sizeof(*s->excitation_history) * MAX_SIGNAL_HISTORY);
348  memset(s->synth_history, 0,
349  sizeof(*s->synth_history) * MAX_LSPS);
350  memset(s->gain_pred_err, 0,
351  sizeof(s->gain_pred_err));
352 
353  if (s->do_apf) {
354  memset(&s->synth_filter_out_buf[MAX_LSPS_ALIGN16 - s->lsps], 0,
355  sizeof(*s->synth_filter_out_buf) * s->lsps);
356  memset(s->dcf_mem, 0,
357  sizeof(*s->dcf_mem) * 2);
358  memset(s->zero_exc_pf, 0,
359  sizeof(*s->zero_exc_pf) * s->history_nsamples);
360  memset(s->denoise_filter_cache, 0, sizeof(s->denoise_filter_cache));
361  }
362 }
363 
364 /**
365  * Set up decoder with parameters from demuxer (extradata etc.).
366  */
368 {
369  static AVOnce init_static_once = AV_ONCE_INIT;
370  int n, flags, pitch_range, lsp16_flag;
371  WMAVoiceContext *s = ctx->priv_data;
372 
373  ff_thread_once(&init_static_once, wmavoice_init_static_data);
374 
375  /**
376  * Extradata layout:
377  * - byte 0-18: WMAPro-in-WMAVoice extradata (see wmaprodec.c),
378  * - byte 19-22: flags field (annoyingly in LE; see below for known
379  * values),
380  * - byte 23-46: variable bitmode tree (really just 17 * 3 bits,
381  * rest is 0).
382  */
383  if (ctx->extradata_size != 46) {
384  av_log(ctx, AV_LOG_ERROR,
385  "Invalid extradata size %d (should be 46)\n",
386  ctx->extradata_size);
387  return AVERROR_INVALIDDATA;
388  }
389  if (ctx->block_align <= 0) {
390  av_log(ctx, AV_LOG_ERROR, "Invalid block alignment %d.\n", ctx->block_align);
391  return AVERROR_INVALIDDATA;
392  }
393 
394  flags = AV_RL32(ctx->extradata + 18);
395  s->spillover_bitsize = 3 + av_ceil_log2(ctx->block_align);
396  s->do_apf = flags & 0x1;
397  if (s->do_apf) {
398  ff_rdft_init(&s->rdft, 7, DFT_R2C);
399  ff_rdft_init(&s->irdft, 7, IDFT_C2R);
400  ff_dct_init(&s->dct, 6, DCT_I);
401  ff_dct_init(&s->dst, 6, DST_I);
402 
403  ff_sine_window_init(s->cos, 256);
404  memcpy(&s->sin[255], s->cos, 256 * sizeof(s->cos[0]));
405  for (n = 0; n < 255; n++) {
406  s->sin[n] = -s->sin[510 - n];
407  s->cos[510 - n] = s->cos[n];
408  }
409  }
410  s->denoise_strength = (flags >> 2) & 0xF;
411  if (s->denoise_strength >= 12) {
412  av_log(ctx, AV_LOG_ERROR,
413  "Invalid denoise filter strength %d (max=11)\n",
414  s->denoise_strength);
415  return AVERROR_INVALIDDATA;
416  }
417  s->denoise_tilt_corr = !!(flags & 0x40);
418  s->dc_level = (flags >> 7) & 0xF;
419  s->lsp_q_mode = !!(flags & 0x2000);
420  s->lsp_def_mode = !!(flags & 0x4000);
421  lsp16_flag = flags & 0x1000;
422  if (lsp16_flag) {
423  s->lsps = 16;
424  } else {
425  s->lsps = 10;
426  }
427  for (n = 0; n < s->lsps; n++)
428  s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
429 
430  init_get_bits(&s->gb, ctx->extradata + 22, (ctx->extradata_size - 22) << 3);
431  if (decode_vbmtree(&s->gb, s->vbm_tree) < 0) {
432  av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree; broken extradata?\n");
433  return AVERROR_INVALIDDATA;
434  }
435 
436  if (ctx->sample_rate >= INT_MAX / (256 * 37))
437  return AVERROR_INVALIDDATA;
438 
439  s->min_pitch_val = ((ctx->sample_rate << 8) / 400 + 50) >> 8;
440  s->max_pitch_val = ((ctx->sample_rate << 8) * 37 / 2000 + 50) >> 8;
441  pitch_range = s->max_pitch_val - s->min_pitch_val;
442  if (pitch_range <= 0) {
443  av_log(ctx, AV_LOG_ERROR, "Invalid pitch range; broken extradata?\n");
444  return AVERROR_INVALIDDATA;
445  }
446  s->pitch_nbits = av_ceil_log2(pitch_range);
447  s->last_pitch_val = 40;
449  s->history_nsamples = s->max_pitch_val + 8;
450 
452  int min_sr = ((((1 << 8) - 50) * 400) + 0xFF) >> 8,
453  max_sr = ((((MAX_SIGNAL_HISTORY - 8) << 8) + 205) * 2000 / 37) >> 8;
454 
455  av_log(ctx, AV_LOG_ERROR,
456  "Unsupported samplerate %d (min=%d, max=%d)\n",
457  ctx->sample_rate, min_sr, max_sr); // 322-22097 Hz
458 
459  return AVERROR(ENOSYS);
460  }
461 
462  s->block_conv_table[0] = s->min_pitch_val;
463  s->block_conv_table[1] = (pitch_range * 25) >> 6;
464  s->block_conv_table[2] = (pitch_range * 44) >> 6;
465  s->block_conv_table[3] = s->max_pitch_val - 1;
466  s->block_delta_pitch_hrange = (pitch_range >> 3) & ~0xF;
467  if (s->block_delta_pitch_hrange <= 0) {
468  av_log(ctx, AV_LOG_ERROR, "Invalid delta pitch hrange; broken extradata?\n");
469  return AVERROR_INVALIDDATA;
470  }
471  s->block_delta_pitch_nbits = 1 + av_ceil_log2(s->block_delta_pitch_hrange);
473  s->block_conv_table[3] + 1 +
474  2 * (s->block_conv_table[1] - 2 * s->min_pitch_val);
475  s->block_pitch_nbits = av_ceil_log2(s->block_pitch_range);
476 
477  ctx->channels = 1;
480 
481  return 0;
482 }
483 
484 /**
485  * @name Postfilter functions
486  * Postfilter functions (gain control, wiener denoise filter, DC filter,
487  * kalman smoothening, plus surrounding code to wrap it)
488  * @{
489  */
/**
 * Adaptive gain control (as used in the postfilter).
 *
 * Scales the postfiltered signal so that its level follows the level of the
 * unfiltered speech. Same scheme as #ff_adaptive_gain_control() in
 * acelp_vectors.c, except that the energy is measured as sum(abs(...))
 * here, whereas other codecs (e.g. AMR-NB, SIPRO) use sqrt(dotproduct(...)).
 *
 * The gain is smoothed per sample: mem = alpha * mem + scale, where
 * scale = (1 - alpha) * speech_energy / postfilter_energy (or 0 if the
 * postfiltered signal is all-zero).
 *
 * @param out          output buffer for the gain-corrected samples
 * @param in           input samples as they are after the postfilter steps
 *                     so far
 * @param speech_synth speech synthesis output before postfiltering
 * @param size         number of samples in each of the buffers
 * @param alpha        exponential smoothing factor
 * @param gain_mem     pointer to the smoothed gain state (single float),
 *                     read on entry and updated on return
 */
static void adaptive_gain_control(float *out, const float *in,
                                  const float *speech_synth,
                                  int size, float alpha, float *gain_mem)
{
    float speech_energy = 0.0, postfilter_energy = 0.0;
    float gain_scale_factor;
    float smoothed = *gain_mem;
    int k;

    /* L1 "energy" of reference and postfiltered signal */
    for (k = 0; k < size; k++)
        speech_energy += fabsf(speech_synth[k]);
    for (k = 0; k < size; k++)
        postfilter_energy += fabsf(in[k]);

    if (postfilter_energy == 0.0)
        gain_scale_factor = 0.0;
    else
        gain_scale_factor = (1.0 - alpha) * speech_energy / postfilter_energy;

    /* first-order recursive smoothing of the gain, applied sample by sample */
    for (k = 0; k < size; k++) {
        smoothed = alpha * smoothed + gain_scale_factor;
        out[k]   = in[k] * smoothed;
    }

    *gain_mem = smoothed;
}
527 
528 /**
529  * Kalman smoothing function.
530  *
531  * This function looks back pitch +/- 3 samples back into history to find
532  * the best fitting curve (that one giving the optimal gain of the two
533  * signals, i.e. the highest dot product between the two), and then
534  * uses that signal history to smoothen the output of the speech synthesis
535  * filter.
536  *
537  * @param s WMA Voice decoding context
538  * @param pitch pitch of the speech signal
539  * @param in input speech signal
540  * @param out output pointer for smoothened signal
541  * @param size input/output buffer size
542  *
543  * @returns -1 if no smoothening took place, e.g. because no optimal
544  * fit could be found, or 0 on success.
545  */
546 static int kalman_smoothen(WMAVoiceContext *s, int pitch,
547  const float *in, float *out, int size)
548 {
549  int n;
550  float optimal_gain = 0, dot;
551  const float *ptr = &in[-FFMAX(s->min_pitch_val, pitch - 3)],
552  *end = &in[-FFMIN(s->max_pitch_val, pitch + 3)],
553  *best_hist_ptr = NULL;
554 
555  /* find best fitting point in history */
556  do {
557  dot = avpriv_scalarproduct_float_c(in, ptr, size);
558  if (dot > optimal_gain) {
559  optimal_gain = dot;
560  best_hist_ptr = ptr;
561  }
562  } while (--ptr >= end);
563 
564  if (optimal_gain <= 0)
565  return -1;
566  dot = avpriv_scalarproduct_float_c(best_hist_ptr, best_hist_ptr, size);
567  if (dot <= 0) // would be 1.0
568  return -1;
569 
570  if (optimal_gain <= dot) {
571  dot = dot / (dot + 0.6 * optimal_gain); // 0.625-1.000
572  } else
573  dot = 0.625;
574 
575  /* actual smoothing */
576  for (n = 0; n < size; n++)
577  out[n] = best_hist_ptr[n] + dot * (in[n] - best_hist_ptr[n]);
578 
579  return 0;
580 }
581 
/**
 * Get the tilt factor of a formant filter from its transfer function.
 *
 * The result is the ratio of two sums over the coefficients:
 * (lpcs[0] + <lpcs[0..n-2], lpcs[1..n-1]>) / (1 + <lpcs, lpcs>).
 *
 * @see #tilt_factor() in amrnbdec.c, which does essentially the same, but
 *      runs a speech synthesis filter in between, which is absent here.
 *
 * @param lpcs   LPC coefficients
 * @param n_lpcs size of the LPC buffer
 * @returns the tilt factor
 */
static float tilt_factor(const float *lpcs, int n_lpcs)
{
    /* zero-lag term, including the implicit leading 1.0 coefficient */
    const float corr_lag0 = 1.0 + avpriv_scalarproduct_float_c(lpcs, lpcs,
                                                               n_lpcs);
    /* one-sample-lag term */
    const float corr_lag1 = lpcs[0] +
                            avpriv_scalarproduct_float_c(lpcs, &lpcs[1],
                                                         n_lpcs - 1);

    return corr_lag1 / corr_lag0;
}
601 
/**
 * Derive denoise filter coefficients (in real domain) from the LPCs.
 *
 * @param s         WMA Voice decoding context; its RDFT/IRDFT/DCT/DST
 *                  contexts and sin/cos tables are used
 * @param lpcs      input LPCs; the buffer is transformed in place and then
 *                  reused as scratch space (entries 0..64 are rewritten)
 * @param fcb_type  fixed codebook type of the frame; only selects the gain
 *                  multiplier (#FCB_TYPE_HARDCODED vs. everything else)
 * @param coeffs    output buffer for the filter coefficients; entries from
 *                  remainder up to 127 are cleared
 * @param remainder number of leading coefficients that are kept and
 *                  normalized
 */
static void calc_input_response(WMAVoiceContext *s, float *lpcs,
                                int fcb_type, float *coeffs, int remainder)
{
    float last_coeff, min = 15.0, max = -15.0;
    float irange, angle_mul, gain_mul, range, sq;
    int n, idx;

    /* Create frequency power spectrum of speech input (i.e. RDFT of LPCs) */
    s->rdft.rdft_calc(&s->rdft, lpcs);
    /* log_range(): store log10(assign) in var and track the running
     * minimum/maximum of all values seen so far */
#define log_range(var, assign) do { \
        float tmp = log10f(assign); var = tmp; \
        max       = FFMAX(max, tmp); min = FFMIN(min, tmp); \
    } while (0)
    /* lpcs[1] is handled first and parked in last_coeff, because lpcs[1]
     * itself is overwritten by the n == 1 iteration of the loop below */
    log_range(last_coeff,  lpcs[1]         * lpcs[1]);
    for (n = 1; n < 64; n++)
        log_range(lpcs[n], lpcs[n * 2]     * lpcs[n * 2] +
                           lpcs[n * 2 + 1] * lpcs[n * 2 + 1]);
    log_range(lpcs[0],     lpcs[0]         * lpcs[0]);
#undef log_range
    range    = max - min;
    lpcs[64] = last_coeff;

    /* Now, use this spectrum to pick out these frequencies with higher
     * (relative) power/energy (which we then take to be "not noise"),
     * and set up a table (still in lpc[]) of (relative) gains per frequency.
     * These frequencies will be maintained, while others ("noise") will be
     * decreased in the filter output. */
    irange    = 64.0 / range; // so irange*(max-value) is in the range [0, 63]
    gain_mul  = range * (fcb_type == FCB_TYPE_HARDCODED ? (5.0 / 13.0) :
                                                          (5.0 / 14.7));
    angle_mul = gain_mul * (8.0 * M_LN10 / M_PI);
    for (n = 0; n <= 64; n++) {
        float pwr;

        idx = FFMAX(0, lrint((max - lpcs[n]) * irange) - 1);
        /* NOTE(review): pwr is read below without any visible assignment.
         * A statement that sets it (presumably a table lookup based on the
         * idx computed above) appears to be missing here - confirm against
         * upstream; as written this reads an uninitialized variable. */
        lpcs[n] = angle_mul * pwr;

        /* 70.57 =~ 1/log10(1.0331663) */
        idx = (pwr * gain_mul - 0.0295) * 70.570526123;
        if (idx > 127) { // fall back if index falls outside table range
            coeffs[n] = wmavoice_energy_table[127] *
                        powf(1.0331663, idx - 127);
        } else
            coeffs[n] = wmavoice_energy_table[FFMAX(0, idx)];
    }

    /* calculate the Hilbert transform of the gains, which we do (since this
     * is a sine input) by doing a phase shift (in theory, H(sin())=cos()).
     * Hilbert_Transform(RDFT(x)) = Laplace_Transform(x), which calculates the
     * "moment" of the LPCs in this filter. */
    s->dct.dct_calc(&s->dct, lpcs);
    s->dst.dct_calc(&s->dst, lpcs);

    /* Split out the coefficient indexes into phase/magnitude pairs */
    idx = 255 + av_clip(lpcs[64],               -255, 255);
    coeffs[0]  = coeffs[0]  * s->cos[idx];
    idx = 255 + av_clip(lpcs[64] - 2 * lpcs[63], -255, 255);
    last_coeff = coeffs[64] * s->cos[idx];
    /* Each pass handles two consecutive values of n: first one using
     * -lpcs[64], then (after the decrement in the break test) one using
     * +lpcs[64]. Walking downwards means coeffs[n] is consumed before the
     * pair at coeffs[2n], coeffs[2n + 1] can overwrite a still-needed
     * entry. The loop ends once n reaches 0. */
    for (n = 63;; n--) {
        idx = 255 + av_clip(-lpcs[64] - 2 * lpcs[n - 1], -255, 255);
        coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
        coeffs[n * 2]     = coeffs[n] * s->cos[idx];

        if (!--n) break;

        idx = 255 + av_clip( lpcs[64] - 2 * lpcs[n - 1], -255, 255);
        coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
        coeffs[n * 2]     = coeffs[n] * s->cos[idx];
    }
    coeffs[1] = last_coeff;

    /* move into real domain */
    s->irdft.rdft_calc(&s->irdft, coeffs);

    /* tilt correction and normalize scale */
    memset(&coeffs[remainder], 0, sizeof(coeffs[0]) * (128 - remainder));
    if (s->denoise_tilt_corr) {
        float tilt_mem = 0;

        coeffs[remainder - 1] = 0;
        ff_tilt_compensation(&tilt_mem,
                             -1.8 * tilt_factor(coeffs, remainder - 1),
                             coeffs, remainder);
    }
    /* scale so that the kept coefficients have an L2 norm of 1/64 */
    sq = (1.0 / 64.0) * sqrtf(1 / avpriv_scalarproduct_float_c(coeffs, coeffs,
                                                               remainder));
    for (n = 0; n < remainder; n++)
        coeffs[n] *= sq;
}
695 
696 /**
697  * This function applies a Wiener filter on the (noisy) speech signal as
698  * a means to denoise it.
699  *
700  * - take RDFT of LPCs to get the power spectrum of the noise + speech;
701  * - using this power spectrum, calculate (for each frequency) the Wiener
702  * filter gain, which depends on the frequency power and desired level
703  * of noise subtraction (when set too high, this leads to artifacts)
704  * We can do this symmetrically over the X-axis (so 0-4kHz is the inverse
705  * of 4-8kHz);
706  * - by doing a phase shift, calculate the Hilbert transform of this array
707  * of per-frequency filter-gains to get the filtering coefficients;
708  * - smoothen/normalize/de-tilt these filter coefficients as desired;
709  * - take RDFT of noisy sound, apply the coefficients and take its IRDFT
710  * to get the denoised speech signal;
711  * - the leftover (i.e. output of the IRDFT on denoised speech data beyond
712  * the frame boundary) are saved and applied to subsequent frames by an
713  * overlap-add method (otherwise you get clicking-artifacts).
714  *
715  * @param s WMA Voice decoding context
716  * @param fcb_type Frame (codebook) type
717  * @param synth_pf input: the noisy speech signal, output: denoised speech
718  * data; should be 16-byte aligned (for ASM purposes)
719  * @param size size of the speech data
720  * @param lpcs LPCs used to synthesize this frame's speech data
721  */
723  float *synth_pf, int size,
724  const float *lpcs)
725 {
726  int remainder, lim, n;
727 
728  if (fcb_type != FCB_TYPE_SILENCE) {
729  float *tilted_lpcs = s->tilted_lpcs_pf,
730  *coeffs = s->denoise_coeffs_pf, tilt_mem = 0;
731 
732  tilted_lpcs[0] = 1.0;
733  memcpy(&tilted_lpcs[1], lpcs, sizeof(lpcs[0]) * s->lsps);
734  memset(&tilted_lpcs[s->lsps + 1], 0,
735  sizeof(tilted_lpcs[0]) * (128 - s->lsps - 1));
736  ff_tilt_compensation(&tilt_mem, 0.7 * tilt_factor(lpcs, s->lsps),
737  tilted_lpcs, s->lsps + 2);
738 
739  /* The IRDFT output (127 samples for 7-bit filter) beyond the frame
740  * size is applied to the next frame. All input beyond this is zero,
741  * and thus all output beyond this will go towards zero, hence we can
742  * limit to min(size-1, 127-size) as a performance consideration. */
743  remainder = FFMIN(127 - size, size - 1);
744  calc_input_response(s, tilted_lpcs, fcb_type, coeffs, remainder);
745 
746  /* apply coefficients (in frequency spectrum domain), i.e. complex
747  * number multiplication */
748  memset(&synth_pf[size], 0, sizeof(synth_pf[0]) * (128 - size));
749  s->rdft.rdft_calc(&s->rdft, synth_pf);
750  s->rdft.rdft_calc(&s->rdft, coeffs);
751  synth_pf[0] *= coeffs[0];
752  synth_pf[1] *= coeffs[1];
753  for (n = 1; n < 64; n++) {
754  float v1 = synth_pf[n * 2], v2 = synth_pf[n * 2 + 1];
755  synth_pf[n * 2] = v1 * coeffs[n * 2] - v2 * coeffs[n * 2 + 1];
756  synth_pf[n * 2 + 1] = v2 * coeffs[n * 2] + v1 * coeffs[n * 2 + 1];
757  }
758  s->irdft.rdft_calc(&s->irdft, synth_pf);
759  }
760 
761  /* merge filter output with the history of previous runs */
762  if (s->denoise_filter_cache_size) {
763  lim = FFMIN(s->denoise_filter_cache_size, size);
764  for (n = 0; n < lim; n++)
765  synth_pf[n] += s->denoise_filter_cache[n];
766  s->denoise_filter_cache_size -= lim;
767  memmove(s->denoise_filter_cache, &s->denoise_filter_cache[size],
769  }
770 
771  /* move remainder of filter output into a cache for future runs */
772  if (fcb_type != FCB_TYPE_SILENCE) {
773  lim = FFMIN(remainder, s->denoise_filter_cache_size);
774  for (n = 0; n < lim; n++)
775  s->denoise_filter_cache[n] += synth_pf[size + n];
776  if (lim < remainder) {
777  memcpy(&s->denoise_filter_cache[lim], &synth_pf[size + lim],
778  sizeof(s->denoise_filter_cache[0]) * (remainder - lim));
779  s->denoise_filter_cache_size = remainder;
780  }
781  }
782 }
783 
/**
 * Averaging projection filter, the postfilter used in WMAVoice.
 *
 * This uses the following steps:
 * - A zero-synthesis filter (generate excitation from synth signal)
 * - Kalman smoothing on excitation, based on pitch (only attempted for
 *   fcb_type >= #FCB_TYPE_AW_PULSES; the unsmoothed excitation is used
 *   if smoothing reports failure)
 * - Re-synthesized smoothened output
 * - Iterative Wiener denoise filter
 * - Adaptive gain filter
 * - DC filter (only if the context's dc_level is above 8)
 *
 * @param s           WMAVoice decoding context
 * @param synth       Speech synthesis output (before postfilter)
 * @param samples     Output buffer for filtered samples
 * @param size        Buffer size of synth & samples; must not exceed
 *                    #MAX_FRAMESIZE / 2 (enforced by an assertion)
 * @param lpcs        Generated LPCs used for speech synthesis
 * @param zero_exc_pf destination for zero synthesis filter (16-byte aligned)
 * @param fcb_type    Frame type (silence, hardcoded, AW-pulses or FCB-pulses)
 * @param pitch       Pitch of the input signal
 */
static void postfilter(WMAVoiceContext *s, const float *synth,
                       float *samples,    int size,
                       const float *lpcs, float *zero_exc_pf,
                       int fcb_type,      int pitch)
{
    /* synth_pf points past the first MAX_LSPS_ALIGN16 entries of the context
     * buffer, so that negative indices address the filter history kept
     * from the previous call */
    float synth_filter_in_buf[MAX_FRAMESIZE / 2],
          *synth_pf = &s->synth_filter_out_buf[MAX_LSPS_ALIGN16],
          *synth_filter_in = zero_exc_pf;

    av_assert0(size <= MAX_FRAMESIZE / 2);

    /* generate excitation from input signal */
    ff_celp_lp_zero_synthesis_filterf(zero_exc_pf, lpcs, synth, size, s->lsps);

    /* kalman_smoothen() returns 0 on success; only then is its output used */
    if (fcb_type >= FCB_TYPE_AW_PULSES &&
        !kalman_smoothen(s, pitch, zero_exc_pf, synth_filter_in_buf, size))
        synth_filter_in = synth_filter_in_buf;

    /* re-synthesize speech after smoothening, and keep history */
    ff_celp_lp_synthesis_filterf(synth_pf, lpcs,
                                 synth_filter_in, size, s->lsps);
    /* copy the last s->lsps output samples in front of synth_pf[0] */
    memcpy(&synth_pf[-s->lsps], &synth_pf[size - s->lsps],
           sizeof(synth_pf[0]) * s->lsps);

    wiener_denoise(s, fcb_type, synth_pf, size, lpcs);

    adaptive_gain_control(samples, synth_pf, synth, size, 0.99,
                          &s->postfilter_agc);

    if (s->dc_level > 8) {
        /* remove ultra-low frequency DC noise / highpass filter;
         * coefficients are identical to those used in SIPR decoding,
         * and very closely resemble those used in AMR-NB decoding. */
        /* NOTE(review): the line that opens this call (callee name and its
         * leading arguments) is not present here; only the trailing
         * arguments remain, so the statement is incomplete as written.
         * Restore it from upstream rather than guessing. */
                   (const float[2]) { -1.99997,      1.0 },
                   (const float[2]) { -1.9330735188, 0.93589198496 },
                   0.93980580475, s->dcf_mem, size);
    }
}
843 /**
844  * @}
845  */
846 
/**
 * Dequantize LSPs.
 *
 * The result is the sum over all stages of (base_q[stage] + mul_q[stage] *
 * entry), where the entries come from one row of that stage's sub-table.
 * The sub-tables are stored back to back in table; stage n has sizes[n]
 * rows of num bytes each, and values[n] picks the row.
 *
 * @param lsps     output pointer to the array that will hold the LSPs;
 *                 cleared before accumulation
 * @param num      number of LSPs to be dequantized
 * @param values   quantized values, contains n_stages values
 * @param sizes    range (i.e. max value) of each quantized value
 * @param n_stages number of dequantization runs
 * @param table    dequantization table to be used
 * @param mul_q    LSF multiplier, one per stage
 * @param base_q   base (lowest) LSF values, one per stage
 */
static void dequant_lsps(double *lsps, int num,
                         const uint16_t *values,
                         const uint16_t *sizes,
                         int n_stages, const uint8_t *table,
                         const double *mul_q,
                         const double *base_q)
{
    const uint8_t *stage_tab = table; /* start of the current stage's rows */
    int stage, i;

    memset(lsps, 0, num * sizeof(*lsps));

    for (stage = 0; stage < n_stages; stage++) {
        const uint8_t *row  = &stage_tab[values[stage] * num];
        const double offset = base_q[stage];
        const double scale  = mul_q[stage];

        for (i = 0; i < num; i++)
            lsps[i] += offset + scale * row[i];

        /* skip all rows of this stage to reach the next sub-table */
        stage_tab += sizes[stage] * num;
    }
}
878 
879 /**
880  * @name LSP dequantization routines
881  * LSP dequantization routines, for 10/16LSPs and independent/residual coding.
882  * lsp10i() consumes 24 bits; lsp10r() consumes an additional 24 bits;
883  * lsp16i() consumes 34 bits; lsp16r() consumes an additional 26 bits.
884  * @{
885  */
886 /**
887  * Parse 10 independently-coded LSPs.
888  */
889 static void dequant_lsp10i(GetBitContext *gb, double *lsps)
890 {
891  static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
892  static const double mul_lsf[4] = {
893  5.2187144800e-3, 1.4626986422e-3,
894  9.6179549166e-4, 1.1325736225e-3
895  };
896  static const double base_lsf[4] = {
897  M_PI * -2.15522e-1, M_PI * -6.1646e-2,
898  M_PI * -3.3486e-2, M_PI * -5.7408e-2
899  };
900  uint16_t v[4];
901 
902  v[0] = get_bits(gb, 8);
903  v[1] = get_bits(gb, 6);
904  v[2] = get_bits(gb, 5);
905  v[3] = get_bits(gb, 5);
906 
907  dequant_lsps(lsps, 10, v, vec_sizes, 4, wmavoice_dq_lsp10i,
908  mul_lsf, base_lsf);
909 }
910 
911 /**
912  * Parse 10 independently-coded LSPs, and then derive the tables to
913  * generate LSPs for the other frames from them (residual coding).
914  */
916  double *i_lsps, const double *old,
917  double *a1, double *a2, int q_mode)
918 {
919  static const uint16_t vec_sizes[3] = { 128, 64, 64 };
920  static const double mul_lsf[3] = {
921  2.5807601174e-3, 1.2354460219e-3, 1.1763821673e-3
922  };
923  static const double base_lsf[3] = {
924  M_PI * -1.07448e-1, M_PI * -5.2706e-2, M_PI * -5.1634e-2
925  };
926  const float (*ipol_tab)[2][10] = q_mode ?
928  uint16_t interpol, v[3];
929  int n;
930 
931  dequant_lsp10i(gb, i_lsps);
932 
933  interpol = get_bits(gb, 5);
934  v[0] = get_bits(gb, 7);
935  v[1] = get_bits(gb, 6);
936  v[2] = get_bits(gb, 6);
937 
938  for (n = 0; n < 10; n++) {
939  double delta = old[n] - i_lsps[n];
940  a1[n] = ipol_tab[interpol][0][n] * delta + i_lsps[n];
941  a1[10 + n] = ipol_tab[interpol][1][n] * delta + i_lsps[n];
942  }
943 
944  dequant_lsps(a2, 20, v, vec_sizes, 3, wmavoice_dq_lsp10r,
945  mul_lsf, base_lsf);
946 }
947 
948 /**
949  * Parse 16 independently-coded LSPs.
950  */
951 static void dequant_lsp16i(GetBitContext *gb, double *lsps)
952 {
953  static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 };
954  static const double mul_lsf[5] = {
955  3.3439586280e-3, 6.9908173703e-4,
956  3.3216608306e-3, 1.0334960326e-3,
957  3.1899104283e-3
958  };
959  static const double base_lsf[5] = {
960  M_PI * -1.27576e-1, M_PI * -2.4292e-2,
961  M_PI * -1.28094e-1, M_PI * -3.2128e-2,
962  M_PI * -1.29816e-1
963  };
964  uint16_t v[5];
965 
966  v[0] = get_bits(gb, 8);
967  v[1] = get_bits(gb, 6);
968  v[2] = get_bits(gb, 7);
969  v[3] = get_bits(gb, 6);
970  v[4] = get_bits(gb, 7);
971 
972  dequant_lsps( lsps, 5, v, vec_sizes, 2,
973  wmavoice_dq_lsp16i1, mul_lsf, base_lsf);
974  dequant_lsps(&lsps[5], 5, &v[2], &vec_sizes[2], 2,
975  wmavoice_dq_lsp16i2, &mul_lsf[2], &base_lsf[2]);
976  dequant_lsps(&lsps[10], 6, &v[4], &vec_sizes[4], 1,
977  wmavoice_dq_lsp16i3, &mul_lsf[4], &base_lsf[4]);
978 }
979 
980 /**
981  * Parse 16 independently-coded LSPs, and then derive the tables to
982  * generate LSPs for the other frames from them (residual coding).
983  */
985  double *i_lsps, const double *old,
986  double *a1, double *a2, int q_mode)
987 {
988  static const uint16_t vec_sizes[3] = { 128, 128, 128 };
989  static const double mul_lsf[3] = {
990  1.2232979501e-3, 1.4062241527e-3, 1.6114744851e-3
991  };
992  static const double base_lsf[3] = {
993  M_PI * -5.5830e-2, M_PI * -5.2908e-2, M_PI * -5.4776e-2
994  };
995  const float (*ipol_tab)[2][16] = q_mode ?
997  uint16_t interpol, v[3];
998  int n;
999 
1000  dequant_lsp16i(gb, i_lsps);
1001 
1002  interpol = get_bits(gb, 5);
1003  v[0] = get_bits(gb, 7);
1004  v[1] = get_bits(gb, 7);
1005  v[2] = get_bits(gb, 7);
1006 
1007  for (n = 0; n < 16; n++) {
1008  double delta = old[n] - i_lsps[n];
1009  a1[n] = ipol_tab[interpol][0][n] * delta + i_lsps[n];
1010  a1[16 + n] = ipol_tab[interpol][1][n] * delta + i_lsps[n];
1011  }
1012 
1013  dequant_lsps( a2, 10, v, vec_sizes, 1,
1014  wmavoice_dq_lsp16r1, mul_lsf, base_lsf);
1015  dequant_lsps(&a2[10], 10, &v[1], &vec_sizes[1], 1,
1016  wmavoice_dq_lsp16r2, &mul_lsf[1], &base_lsf[1]);
1017  dequant_lsps(&a2[20], 12, &v[2], &vec_sizes[2], 1,
1018  wmavoice_dq_lsp16r3, &mul_lsf[2], &base_lsf[2]);
1019 }
1020 
1021 /**
1022  * @}
1023  * @name Pitch-adaptive window coding functions
1024  * The next few functions are for pitch-adaptive window coding.
1025  * @{
1026  */
1027 /**
1028  * Parse the offset of the first pitch-adaptive window pulses, and
1029  * the distribution of pulses between the two blocks in this frame.
1030  * @param s WMA Voice decoding context private data
1031  * @param gb bit I/O context
1032  * @param pitch pitch for each block in this frame
1033  */
1035  const int *pitch)
1036 {
1037  static const int16_t start_offset[94] = {
1038  -11, -9, -7, -5, -3, -1, 1, 3, 5, 7, 9, 11,
1039  13, 15, 18, 17, 19, 20, 21, 22, 23, 24, 25, 26,
1040  27, 28, 29, 30, 31, 32, 33, 35, 37, 39, 41, 43,
1041  45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67,
1042  69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91,
1043  93, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113, 115,
1044  117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139,
1045  141, 143, 145, 147, 149, 151, 153, 155, 157, 159
1046  };
1047  int bits, offset;
1048 
1049  /* position of pulse */
1050  s->aw_idx_is_ext = 0;
1051  if ((bits = get_bits(gb, 6)) >= 54) {
1052  s->aw_idx_is_ext = 1;
1053  bits += (bits - 54) * 3 + get_bits(gb, 2);
1054  }
1055 
1056  /* for a repeated pulse at pulse_off with a pitch_lag of pitch[], count
1057  * the distribution of the pulses in each block contained in this frame. */
1058  s->aw_pulse_range = FFMIN(pitch[0], pitch[1]) > 32 ? 24 : 16;
1059  for (offset = start_offset[bits]; offset < 0; offset += pitch[0]) ;
1060  s->aw_n_pulses[0] = (pitch[0] - 1 + MAX_FRAMESIZE / 2 - offset) / pitch[0];
1061  s->aw_first_pulse_off[0] = offset - s->aw_pulse_range / 2;
1062  offset += s->aw_n_pulses[0] * pitch[0];
1063  s->aw_n_pulses[1] = (pitch[1] - 1 + MAX_FRAMESIZE - offset) / pitch[1];
1064  s->aw_first_pulse_off[1] = offset - (MAX_FRAMESIZE + s->aw_pulse_range) / 2;
1065 
1066  /* if continuing from a position before the block, reset position to
1067  * start of block (when corrected for the range over which it can be
1068  * spread in aw_pulse_set1()). */
1069  if (start_offset[bits] < MAX_FRAMESIZE / 2) {
1070  while (s->aw_first_pulse_off[1] - pitch[1] + s->aw_pulse_range > 0)
1071  s->aw_first_pulse_off[1] -= pitch[1];
1072  if (start_offset[bits] < 0)
1073  while (s->aw_first_pulse_off[0] - pitch[0] + s->aw_pulse_range > 0)
1074  s->aw_first_pulse_off[0] -= pitch[0];
1075  }
1076 }
1077 
1078 /**
1079  * Apply second set of pitch-adaptive window pulses.
1080  * @param s WMA Voice decoding context private data
1081  * @param gb bit I/O context
1082  * @param block_idx block index in frame [0, 1]
1083  * @param fcb structure containing fixed codebook vector info
1084  * @return -1 on error, 0 otherwise
1085  */
1087  int block_idx, AMRFixed *fcb)
1088 {
1089  uint16_t use_mask_mem[9]; // only 5 are used, rest is padding
1090  uint16_t *use_mask = use_mask_mem + 2;
1091  /* in this function, idx is the index in the 80-bit (+ padding) use_mask
1092  * bit-array. Since use_mask consists of 16-bit values, the lower 4 bits
1093  * of idx are the position of the bit within a particular item in the
1094  * array (0 being the most significant bit, and 15 being the least
1095  * significant bit), and the remainder (>> 4) is the index in the
1096  * use_mask[]-array. This is faster and uses less memory than using a
1097  * 80-byte/80-int array. */
1098  int pulse_off = s->aw_first_pulse_off[block_idx],
1099  pulse_start, n, idx, range, aidx, start_off = 0;
1100 
1101  /* set offset of first pulse to within this block */
1102  if (s->aw_n_pulses[block_idx] > 0)
1103  while (pulse_off + s->aw_pulse_range < 1)
1104  pulse_off += fcb->pitch_lag;
1105 
1106  /* find range per pulse */
1107  if (s->aw_n_pulses[0] > 0) {
1108  if (block_idx == 0) {
1109  range = 32;
1110  } else /* block_idx = 1 */ {
1111  range = 8;
1112  if (s->aw_n_pulses[block_idx] > 0)
1113  pulse_off = s->aw_next_pulse_off_cache;
1114  }
1115  } else
1116  range = 16;
1117  pulse_start = s->aw_n_pulses[block_idx] > 0 ? pulse_off - range / 2 : 0;
1118 
1119  /* aw_pulse_set1() already applies pulses around pulse_off (to be exactly,
1120  * in the range of [pulse_off, pulse_off + s->aw_pulse_range], and thus
1121  * we exclude that range from being pulsed again in this function. */
1122  memset(&use_mask[-2], 0, 2 * sizeof(use_mask[0]));
1123  memset( use_mask, -1, 5 * sizeof(use_mask[0]));
1124  memset(&use_mask[5], 0, 2 * sizeof(use_mask[0]));
1125  if (s->aw_n_pulses[block_idx] > 0)
1126  for (idx = pulse_off; idx < MAX_FRAMESIZE / 2; idx += fcb->pitch_lag) {
1127  int excl_range = s->aw_pulse_range; // always 16 or 24
1128  uint16_t *use_mask_ptr = &use_mask[idx >> 4];
1129  int first_sh = 16 - (idx & 15);
1130  *use_mask_ptr++ &= 0xFFFFu << first_sh;
1131  excl_range -= first_sh;
1132  if (excl_range >= 16) {
1133  *use_mask_ptr++ = 0;
1134  *use_mask_ptr &= 0xFFFF >> (excl_range - 16);
1135  } else
1136  *use_mask_ptr &= 0xFFFF >> excl_range;
1137  }
1138 
1139  /* find the 'aidx'th offset that is not excluded */
1140  aidx = get_bits(gb, s->aw_n_pulses[0] > 0 ? 5 - 2 * block_idx : 4);
1141  for (n = 0; n <= aidx; pulse_start++) {
1142  for (idx = pulse_start; idx < 0; idx += fcb->pitch_lag) ;
1143  if (idx >= MAX_FRAMESIZE / 2) { // find from zero
1144  if (use_mask[0]) idx = 0x0F;
1145  else if (use_mask[1]) idx = 0x1F;
1146  else if (use_mask[2]) idx = 0x2F;
1147  else if (use_mask[3]) idx = 0x3F;
1148  else if (use_mask[4]) idx = 0x4F;
1149  else return -1;
1150  idx -= av_log2_16bit(use_mask[idx >> 4]);
1151  }
1152  if (use_mask[idx >> 4] & (0x8000 >> (idx & 15))) {
1153  use_mask[idx >> 4] &= ~(0x8000 >> (idx & 15));
1154  n++;
1155  start_off = idx;
1156  }
1157  }
1158 
1159  fcb->x[fcb->n] = start_off;
1160  fcb->y[fcb->n] = get_bits1(gb) ? -1.0 : 1.0;
1161  fcb->n++;
1162 
1163  /* set offset for next block, relative to start of that block */
1164  n = (MAX_FRAMESIZE / 2 - start_off) % fcb->pitch_lag;
1165  s->aw_next_pulse_off_cache = n ? fcb->pitch_lag - n : 0;
1166  return 0;
1167 }
1168 
1169 /**
1170  * Apply first set of pitch-adaptive window pulses.
1171  * @param s WMA Voice decoding context private data
1172  * @param gb bit I/O context
1173  * @param block_idx block index in frame [0, 1]
1174  * @param fcb storage location for fixed codebook pulse info
1175  */
1177  int block_idx, AMRFixed *fcb)
1178 {
1179  int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
1180  float v;
1181 
1182  if (s->aw_n_pulses[block_idx] > 0) {
1183  int n, v_mask, i_mask, sh, n_pulses;
1184 
1185  if (s->aw_pulse_range == 24) { // 3 pulses, 1:sign + 3:index each
1186  n_pulses = 3;
1187  v_mask = 8;
1188  i_mask = 7;
1189  sh = 4;
1190  } else { // 4 pulses, 1:sign + 2:index each
1191  n_pulses = 4;
1192  v_mask = 4;
1193  i_mask = 3;
1194  sh = 3;
1195  }
1196 
1197  for (n = n_pulses - 1; n >= 0; n--, val >>= sh) {
1198  fcb->y[fcb->n] = (val & v_mask) ? -1.0 : 1.0;
1199  fcb->x[fcb->n] = (val & i_mask) * n_pulses + n +
1200  s->aw_first_pulse_off[block_idx];
1201  while (fcb->x[fcb->n] < 0)
1202  fcb->x[fcb->n] += fcb->pitch_lag;
1203  if (fcb->x[fcb->n] < MAX_FRAMESIZE / 2)
1204  fcb->n++;
1205  }
1206  } else {
1207  int num2 = (val & 0x1FF) >> 1, delta, idx;
1208 
1209  if (num2 < 1 * 79) { delta = 1; idx = num2 + 1; }
1210  else if (num2 < 2 * 78) { delta = 3; idx = num2 + 1 - 1 * 77; }
1211  else if (num2 < 3 * 77) { delta = 5; idx = num2 + 1 - 2 * 76; }
1212  else { delta = 7; idx = num2 + 1 - 3 * 75; }
1213  v = (val & 0x200) ? -1.0 : 1.0;
1214 
1215  fcb->no_repeat_mask |= 3 << fcb->n;
1216  fcb->x[fcb->n] = idx - delta;
1217  fcb->y[fcb->n] = v;
1218  fcb->x[fcb->n + 1] = idx;
1219  fcb->y[fcb->n + 1] = (val & 1) ? -v : v;
1220  fcb->n += 2;
1221  }
1222 }
1223 
1224 /**
1225  * @}
1226  *
1227  * Generate a random number from frame_cntr and block_idx, which will live
1228  * in the range [0, 1000 - block_size) (so it can be used as an index in a
1229  * table of size 1000 of which you want to read block_size entries).
1230  *
1231  * @param frame_cntr current frame number
1232  * @param block_num current block index
1233  * @param block_size amount of entries we want to read from a table
1234  * that has 1000 entries
1235  * @return a (non-)random number in the [0, 1000 - block_size) range.
1236  */
static int pRNG(int frame_cntr, int block_num, int block_size)
{
    /* Division-free evaluation of
     *     y = (x % 9) * 5 + 6;
     *     z = (49995 * x) / y;
     * y can only take 9 distinct values, so the division is replaced by
     * a lookup plus a FASTDIV-style multiplication. For each y:
     *     z = x * (49995 / y) + x * ((49995 % y) / y)
     * Each row below belongs to one value of y; the first column is
     * 49995 / y, the second is the FASTDIV form of (49995 % y) / y. */
    static const unsigned int div_tbl[9][2] = {
        { 8332,  3 * 715827883U }, // y =  6
        { 4545,  0 * 390451573U }, // y = 11
        { 3124, 11 * 268435456U }, // y = 16
        { 2380, 15 * 204522253U }, // y = 21
        { 1922, 23 * 165191050U }, // y = 26
        { 1612, 23 * 138547333U }, // y = 31
        { 1388, 27 * 119304648U }, // y = 36
        { 1219, 16 * 104755300U }, // y = 41
        { 1086, 39 *  93368855U }  // y = 46
    };
    unsigned int seed, row, quot;

    seed = MUL16(block_num, 1877) + frame_cntr;
    /* a single subtraction acts as "% 0xFFFF" here: the largest expected
     * seed is 8 * 1877 + 0xFFFE = 0x13AA6 */
    if (seed >= 0xFFFF)
        seed -= 0xFFFF;

    /* row = seed % 9, computed with a multiply-high instead of a divide */
    row = seed - 9 * MULH(477218589, seed);

    /* quot = low 16 bits of seed * 49995 / (row * 5 + 6) */
    quot = (uint16_t) (seed * div_tbl[row][0] + UMULH(seed, div_tbl[row][1]));

    /* fold into [0, 1000 - block_size) */
    return quot % (1000 - block_size);
}
1268 
1269 /**
1270  * Parse hardcoded signal for a single block.
1271  * @note see #synth_block().
1272  */
1274  int block_idx, int size,
1275  const struct frame_type_desc *frame_desc,
1276  float *excitation)
1277 {
1278  float gain;
1279  int n, r_idx;
1280 
1281  av_assert0(size <= MAX_FRAMESIZE);
1282 
1283  /* Set the offset from which we start reading wmavoice_std_codebook */
1284  if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
1285  r_idx = pRNG(s->frame_cntr, block_idx, size);
1286  gain = s->silence_gain;
1287  } else /* FCB_TYPE_HARDCODED */ {
1288  r_idx = get_bits(gb, 8);
1289  gain = wmavoice_gain_universal[get_bits(gb, 6)];
1290  }
1291 
1292  /* Clear gain prediction parameters */
1293  memset(s->gain_pred_err, 0, sizeof(s->gain_pred_err));
1294 
1295  /* Apply gain to hardcoded codebook and use that as excitation signal */
1296  for (n = 0; n < size; n++)
1297  excitation[n] = wmavoice_std_codebook[r_idx + n] * gain;
1298 }
1299 
1300 /**
1301  * Parse FCB/ACB signal for a single block.
1302  * @note see #synth_block().
1303  */
1305  int block_idx, int size,
1306  int block_pitch_sh2,
1307  const struct frame_type_desc *frame_desc,
1308  float *excitation)
1309 {
1310  static const float gain_coeff[6] = {
1311  0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
1312  };
1313  float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
1314  int n, idx, gain_weight;
1315  AMRFixed fcb;
1316 
1317  av_assert0(size <= MAX_FRAMESIZE / 2);
1318  memset(pulses, 0, sizeof(*pulses) * size);
1319 
1320  fcb.pitch_lag = block_pitch_sh2 >> 2;
1321  fcb.pitch_fac = 1.0;
1322  fcb.no_repeat_mask = 0;
1323  fcb.n = 0;
1324 
1325  /* For the other frame types, this is where we apply the innovation
1326  * (fixed) codebook pulses of the speech signal. */
1327  if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1328  aw_pulse_set1(s, gb, block_idx, &fcb);
1329  if (aw_pulse_set2(s, gb, block_idx, &fcb)) {
1330  /* Conceal the block with silence and return.
1331  * Skip the correct amount of bits to read the next
1332  * block from the correct offset. */
1333  int r_idx = pRNG(s->frame_cntr, block_idx, size);
1334 
1335  for (n = 0; n < size; n++)
1336  excitation[n] =
1337  wmavoice_std_codebook[r_idx + n] * s->silence_gain;
1338  skip_bits(gb, 7 + 1);
1339  return;
1340  }
1341  } else /* FCB_TYPE_EXC_PULSES */ {
1342  int offset_nbits = 5 - frame_desc->log_n_blocks;
1343 
1344  fcb.no_repeat_mask = -1;
1345  /* similar to ff_decode_10_pulses_35bits(), but with single pulses
1346  * (instead of double) for a subset of pulses */
1347  for (n = 0; n < 5; n++) {
1348  float sign;
1349  int pos1, pos2;
1350 
1351  sign = get_bits1(gb) ? 1.0 : -1.0;
1352  pos1 = get_bits(gb, offset_nbits);
1353  fcb.x[fcb.n] = n + 5 * pos1;
1354  fcb.y[fcb.n++] = sign;
1355  if (n < frame_desc->dbl_pulses) {
1356  pos2 = get_bits(gb, offset_nbits);
1357  fcb.x[fcb.n] = n + 5 * pos2;
1358  fcb.y[fcb.n++] = (pos1 < pos2) ? -sign : sign;
1359  }
1360  }
1361  }
1362  ff_set_fixed_vector(pulses, &fcb, 1.0, size);
1363 
1364  /* Calculate gain for adaptive & fixed codebook signal.
1365  * see ff_amr_set_fixed_gain(). */
1366  idx = get_bits(gb, 7);
1368  gain_coeff, 6) -
1369  5.2409161640 + wmavoice_gain_codebook_fcb[idx]);
1370  acb_gain = wmavoice_gain_codebook_acb[idx];
1371  pred_err = av_clipf(wmavoice_gain_codebook_fcb[idx],
1372  -2.9957322736 /* log(0.05) */,
1373  1.6094379124 /* log(5.0) */);
1374 
1375  gain_weight = 8 >> frame_desc->log_n_blocks;
1376  memmove(&s->gain_pred_err[gain_weight], s->gain_pred_err,
1377  sizeof(*s->gain_pred_err) * (6 - gain_weight));
1378  for (n = 0; n < gain_weight; n++)
1379  s->gain_pred_err[n] = pred_err;
1380 
1381  /* Calculation of adaptive codebook */
1382  if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
1383  int len;
1384  for (n = 0; n < size; n += len) {
1385  int next_idx_sh16;
1386  int abs_idx = block_idx * size + n;
1387  int pitch_sh16 = (s->last_pitch_val << 16) +
1388  s->pitch_diff_sh16 * abs_idx;
1389  int pitch = (pitch_sh16 + 0x6FFF) >> 16;
1390  int idx_sh16 = ((pitch << 16) - pitch_sh16) * 8 + 0x58000;
1391  idx = idx_sh16 >> 16;
1392  if (s->pitch_diff_sh16) {
1393  if (s->pitch_diff_sh16 > 0) {
1394  next_idx_sh16 = (idx_sh16) &~ 0xFFFF;
1395  } else
1396  next_idx_sh16 = (idx_sh16 + 0x10000) &~ 0xFFFF;
1397  len = av_clip((idx_sh16 - next_idx_sh16) / s->pitch_diff_sh16 / 8,
1398  1, size - n);
1399  } else
1400  len = size;
1401 
1402  ff_acelp_interpolatef(&excitation[n], &excitation[n - pitch],
1404  idx, 9, len);
1405  }
1406  } else /* ACB_TYPE_HAMMING */ {
1407  int block_pitch = block_pitch_sh2 >> 2;
1408  idx = block_pitch_sh2 & 3;
1409  if (idx) {
1410  ff_acelp_interpolatef(excitation, &excitation[-block_pitch],
1412  idx, 8, size);
1413  } else
1414  av_memcpy_backptr((uint8_t *) excitation, sizeof(float) * block_pitch,
1415  sizeof(float) * size);
1416  }
1417 
1418  /* Interpolate ACB/FCB and use as excitation signal */
1419  ff_weighted_vector_sumf(excitation, excitation, pulses,
1420  acb_gain, fcb_gain, size);
1421 }
1422 
1423 /**
1424  * Parse data in a single block.
1425  *
1426  * @param s WMA Voice decoding context private data
1427  * @param gb bit I/O context
1428  * @param block_idx index of the to-be-read block
1429  * @param size amount of samples to be read in this block
1430  * @param block_pitch_sh2 pitch for this block << 2
1431  * @param lsps LSPs for (the end of) this frame
1432  * @param prev_lsps LSPs for the last frame
1433  * @param frame_desc frame type descriptor
1434  * @param excitation target memory for the ACB+FCB interpolated signal
1435  * @param synth target memory for the speech synthesis filter output
1436  * @return 0 on success, <0 on error.
1437  */
1439  int block_idx, int size,
1440  int block_pitch_sh2,
1441  const double *lsps, const double *prev_lsps,
1442  const struct frame_type_desc *frame_desc,
1443  float *excitation, float *synth)
1444 {
1445  double i_lsps[MAX_LSPS];
1446  float lpcs[MAX_LSPS];
1447  float fac;
1448  int n;
1449 
1450  if (frame_desc->acb_type == ACB_TYPE_NONE)
1451  synth_block_hardcoded(s, gb, block_idx, size, frame_desc, excitation);
1452  else
1453  synth_block_fcb_acb(s, gb, block_idx, size, block_pitch_sh2,
1454  frame_desc, excitation);
1455 
1456  /* convert interpolated LSPs to LPCs */
1457  fac = (block_idx + 0.5) / frame_desc->n_blocks;
1458  for (n = 0; n < s->lsps; n++) // LSF -> LSP
1459  i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n] - prev_lsps[n]));
1460  ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1461 
1462  /* Speech synthesis */
1463  ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s->lsps);
1464 }
1465 
1466 /**
1467  * Synthesize output samples for a single frame.
1468  *
1469  * @param ctx WMA Voice decoder context
1470  * @param gb bit I/O context (s->gb or one for cross-packet superframes)
1471  * @param frame_idx Frame number within superframe [0-2]
1472  * @param samples pointer to output sample buffer, has space for at least 160
1473  * samples
1474  * @param lsps LSP array
1475  * @param prev_lsps array of previous frame's LSPs
1476  * @param excitation target buffer for excitation signal
1477  * @param synth target buffer for synthesized speech data
1478  * @return 0 on success, <0 on error.
1479  */
1480 static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx,
1481  float *samples,
1482  const double *lsps, const double *prev_lsps,
1483  float *excitation, float *synth)
1484 {
1485  WMAVoiceContext *s = ctx->priv_data;
1486  int n, n_blocks_x2, log_n_blocks_x2, av_uninit(cur_pitch_val);
1487  int pitch[MAX_BLOCKS], av_uninit(last_block_pitch);
1488 
1489  /* Parse frame type ("frame header"), see frame_descs */
1490  int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)], block_nsamples;
1491 
1492  if (bd_idx < 0) {
1493  av_log(ctx, AV_LOG_ERROR,
1494  "Invalid frame type VLC code, skipping\n");
1495  return AVERROR_INVALIDDATA;
1496  }
1497 
1498  block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;
1499 
1500  /* Pitch calculation for ACB_TYPE_ASYMMETRIC ("pitch-per-frame") */
1501  if (frame_descs[bd_idx].acb_type == ACB_TYPE_ASYMMETRIC) {
1502  /* Pitch is provided per frame, which is interpreted as the pitch of
1503  * the last sample of the last block of this frame. We can interpolate
1504  * the pitch of other blocks (and even pitch-per-sample) by gradually
1505  * incrementing/decrementing prev_frame_pitch to cur_pitch_val. */
1506  n_blocks_x2 = frame_descs[bd_idx].n_blocks << 1;
1507  log_n_blocks_x2 = frame_descs[bd_idx].log_n_blocks + 1;
1508  cur_pitch_val = s->min_pitch_val + get_bits(gb, s->pitch_nbits);
1509  cur_pitch_val = FFMIN(cur_pitch_val, s->max_pitch_val - 1);
1510  if (s->last_acb_type == ACB_TYPE_NONE ||
1511  20 * abs(cur_pitch_val - s->last_pitch_val) >
1512  (cur_pitch_val + s->last_pitch_val))
1513  s->last_pitch_val = cur_pitch_val;
1514 
1515  /* pitch per block */
1516  for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1517  int fac = n * 2 + 1;
1518 
1519  pitch[n] = (MUL16(fac, cur_pitch_val) +
1520  MUL16((n_blocks_x2 - fac), s->last_pitch_val) +
1521  frame_descs[bd_idx].n_blocks) >> log_n_blocks_x2;
1522  }
1523 
1524  /* "pitch-diff-per-sample" for calculation of pitch per sample */
1525  s->pitch_diff_sh16 =
1526  (cur_pitch_val - s->last_pitch_val) * (1 << 16) / MAX_FRAMESIZE;
1527  }
1528 
1529  /* Global gain (if silence) and pitch-adaptive window coordinates */
1530  switch (frame_descs[bd_idx].fcb_type) {
1531  case FCB_TYPE_SILENCE:
1533  break;
1534  case FCB_TYPE_AW_PULSES:
1535  aw_parse_coords(s, gb, pitch);
1536  break;
1537  }
1538 
1539  for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1540  int bl_pitch_sh2;
1541 
1542  /* Pitch calculation for ACB_TYPE_HAMMING ("pitch-per-block") */
1543  switch (frame_descs[bd_idx].acb_type) {
1544  case ACB_TYPE_HAMMING: {
1545  /* Pitch is given per block. Per-block pitches are encoded as an
1546  * absolute value for the first block, and then delta values
1547  * relative to this value) for all subsequent blocks. The scale of
1548  * this pitch value is semi-logarithmic compared to its use in the
1549  * decoder, so we convert it to normal scale also. */
1550  int block_pitch,
1551  t1 = (s->block_conv_table[1] - s->block_conv_table[0]) << 2,
1552  t2 = (s->block_conv_table[2] - s->block_conv_table[1]) << 1,
1553  t3 = s->block_conv_table[3] - s->block_conv_table[2] + 1;
1554 
1555  if (n == 0) {
1556  block_pitch = get_bits(gb, s->block_pitch_nbits);
1557  } else
1558  block_pitch = last_block_pitch - s->block_delta_pitch_hrange +
1560  /* Convert last_ so that any next delta is within _range */
1561  last_block_pitch = av_clip(block_pitch,
1563  s->block_pitch_range -
1565 
1566  /* Convert semi-log-style scale back to normal scale */
1567  if (block_pitch < t1) {
1568  bl_pitch_sh2 = (s->block_conv_table[0] << 2) + block_pitch;
1569  } else {
1570  block_pitch -= t1;
1571  if (block_pitch < t2) {
1572  bl_pitch_sh2 =
1573  (s->block_conv_table[1] << 2) + (block_pitch << 1);
1574  } else {
1575  block_pitch -= t2;
1576  if (block_pitch < t3) {
1577  bl_pitch_sh2 =
1578  (s->block_conv_table[2] + block_pitch) << 2;
1579  } else
1580  bl_pitch_sh2 = s->block_conv_table[3] << 2;
1581  }
1582  }
1583  pitch[n] = bl_pitch_sh2 >> 2;
1584  break;
1585  }
1586 
1587  case ACB_TYPE_ASYMMETRIC: {
1588  bl_pitch_sh2 = pitch[n] << 2;
1589  break;
1590  }
1591 
1592  default: // ACB_TYPE_NONE has no pitch
1593  bl_pitch_sh2 = 0;
1594  break;
1595  }
1596 
1597  synth_block(s, gb, n, block_nsamples, bl_pitch_sh2,
1598  lsps, prev_lsps, &frame_descs[bd_idx],
1599  &excitation[n * block_nsamples],
1600  &synth[n * block_nsamples]);
1601  }
1602 
1603  /* Averaging projection filter, if applicable. Else, just copy samples
1604  * from synthesis buffer */
1605  if (s->do_apf) {
1606  double i_lsps[MAX_LSPS];
1607  float lpcs[MAX_LSPS];
1608 
1609  for (n = 0; n < s->lsps; n++) // LSF -> LSP
1610  i_lsps[n] = cos(0.5 * (prev_lsps[n] + lsps[n]));
1611  ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1612  postfilter(s, synth, samples, 80, lpcs,
1613  &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx],
1614  frame_descs[bd_idx].fcb_type, pitch[0]);
1615 
1616  for (n = 0; n < s->lsps; n++) // LSF -> LSP
1617  i_lsps[n] = cos(lsps[n]);
1618  ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1619  postfilter(s, &synth[80], &samples[80], 80, lpcs,
1620  &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx + 80],
1621  frame_descs[bd_idx].fcb_type, pitch[0]);
1622  } else
1623  memcpy(samples, synth, 160 * sizeof(synth[0]));
1624 
1625  /* Cache values for next frame */
1626  s->frame_cntr++;
1627  if (s->frame_cntr >= 0xFFFF) s->frame_cntr -= 0xFFFF; // i.e. modulo (%)
1628  s->last_acb_type = frame_descs[bd_idx].acb_type;
1629  switch (frame_descs[bd_idx].acb_type) {
1630  case ACB_TYPE_NONE:
1631  s->last_pitch_val = 0;
1632  break;
1633  case ACB_TYPE_ASYMMETRIC:
1634  s->last_pitch_val = cur_pitch_val;
1635  break;
1636  case ACB_TYPE_HAMMING:
1637  s->last_pitch_val = pitch[frame_descs[bd_idx].n_blocks - 1];
1638  break;
1639  }
1640 
1641  return 0;
1642 }
1643 
1644 /**
1645  * Ensure minimum value for first item, maximum value for last value,
1646  * proper spacing between each value and proper ordering.
1647  *
1648  * @param lsps array of LSPs
1649  * @param num size of LSP array
1650  *
1651  * @note basically a double version of #ff_acelp_reorder_lsf(), might be
1652  * useful to put in a generic location later on. Parts are also
1653  * present in #ff_set_min_dist_lsf() + #ff_sort_nearly_sorted_floats(),
1654  * which is in float.
1655  */
1656 static void stabilize_lsps(double *lsps, int num)
1657 {
1658  int n, m, l;
1659 
1660  /* set minimum value for first, maximum value for last and minimum
1661  * spacing between LSF values.
1662  * Very similar to ff_set_min_dist_lsf(), but in double. */
1663  lsps[0] = FFMAX(lsps[0], 0.0015 * M_PI);
1664  for (n = 1; n < num; n++)
1665  lsps[n] = FFMAX(lsps[n], lsps[n - 1] + 0.0125 * M_PI);
1666  lsps[num - 1] = FFMIN(lsps[num - 1], 0.9985 * M_PI);
1667 
1668  /* reorder (looks like one-time / non-recursed bubblesort).
1669  * Very similar to ff_sort_nearly_sorted_floats(), but in double. */
1670  for (n = 1; n < num; n++) {
1671  if (lsps[n] < lsps[n - 1]) {
1672  for (m = 1; m < num; m++) {
1673  double tmp = lsps[m];
1674  for (l = m - 1; l >= 0; l--) {
1675  if (lsps[l] <= tmp) break;
1676  lsps[l + 1] = lsps[l];
1677  }
1678  lsps[l + 1] = tmp;
1679  }
1680  break;
1681  }
1682  }
1683 }
1684 
1685 /**
1686  * Synthesize output samples for a single superframe. If we have any data
1687  * cached in s->sframe_cache, that will be used instead of whatever is loaded
1688  * in s->gb.
1689  *
1690  * WMA Voice superframes contain 3 frames, each containing 160 audio samples,
1691  * to give a total of 480 samples per superframe. See #synth_frame() for frame
1692  * parsing. In addition to 3 frames, superframes can also contain the LSPs
1693  * (if these are globally specified for all frames (residually); they can
1694  * also be specified individually per-frame. See the s->has_residual_lsps
1695  * option), and can specify the number of samples encoded in this superframe
1696  * (if less than 480), usually used to prevent blanks at track boundaries.
1697  *
1698  * @param ctx WMA Voice decoder context
1699  * @return 0 on success, <0 on error or 1 if there was not enough data to
1700  * fully parse the superframe
1701  */
1703  int *got_frame_ptr)
1704 {
1705  WMAVoiceContext *s = ctx->priv_data;
1706  GetBitContext *gb = &s->gb, s_gb;
1707  int n, res, n_samples = MAX_SFRAMESIZE;
1708  double lsps[MAX_FRAMES][MAX_LSPS];
1709  const double *mean_lsf = s->lsps == 16 ?
1711  float excitation[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE + 12];
1712  float synth[MAX_LSPS + MAX_SFRAMESIZE];
1713  float *samples;
1714 
1715  memcpy(synth, s->synth_history,
1716  s->lsps * sizeof(*synth));
1717  memcpy(excitation, s->excitation_history,
1718  s->history_nsamples * sizeof(*excitation));
1719 
1720  if (s->sframe_cache_size > 0) {
1721  gb = &s_gb;
1723  s->sframe_cache_size = 0;
1724  }
1725 
1726  /* First bit is speech/music bit, it differentiates between WMAVoice
1727  * speech samples (the actual codec) and WMAVoice music samples, which
1728  * are really WMAPro-in-WMAVoice-superframes. I've never seen those in
1729  * the wild yet. */
1730  if (!get_bits1(gb)) {
1731  avpriv_request_sample(ctx, "WMAPro-in-WMAVoice");
1732  return AVERROR_PATCHWELCOME;
1733  }
1734 
1735  /* (optional) nr. of samples in superframe; always <= 480 and >= 0 */
1736  if (get_bits1(gb)) {
1737  if ((n_samples = get_bits(gb, 12)) > MAX_SFRAMESIZE) {
1738  av_log(ctx, AV_LOG_ERROR,
1739  "Superframe encodes > %d samples (%d), not allowed\n",
1740  MAX_SFRAMESIZE, n_samples);
1741  return AVERROR_INVALIDDATA;
1742  }
1743  }
1744 
1745  /* Parse LSPs, if global for the superframe (can also be per-frame). */
1746  if (s->has_residual_lsps) {
1747  double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];
1748 
1749  for (n = 0; n < s->lsps; n++)
1750  prev_lsps[n] = s->prev_lsps[n] - mean_lsf[n];
1751 
1752  if (s->lsps == 10) {
1753  dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1754  } else /* s->lsps == 16 */
1755  dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1756 
1757  for (n = 0; n < s->lsps; n++) {
1758  lsps[0][n] = mean_lsf[n] + (a1[n] - a2[n * 2]);
1759  lsps[1][n] = mean_lsf[n] + (a1[s->lsps + n] - a2[n * 2 + 1]);
1760  lsps[2][n] += mean_lsf[n];
1761  }
1762  for (n = 0; n < 3; n++)
1763  stabilize_lsps(lsps[n], s->lsps);
1764  }
1765 
1766  /* synth_superframe can run multiple times per packet
1767  * free potential previous frame */
1768  av_frame_unref(frame);
1769 
1770  /* get output buffer */
1771  frame->nb_samples = MAX_SFRAMESIZE;
1772  if ((res = ff_get_buffer(ctx, frame, 0)) < 0)
1773  return res;
1774  frame->nb_samples = n_samples;
1775  samples = (float *)frame->data[0];
1776 
1777  /* Parse frames, optionally preceded by per-frame (independent) LSPs. */
1778  for (n = 0; n < 3; n++) {
1779  if (!s->has_residual_lsps) {
1780  int m;
1781 
1782  if (s->lsps == 10) {
1783  dequant_lsp10i(gb, lsps[n]);
1784  } else /* s->lsps == 16 */
1785  dequant_lsp16i(gb, lsps[n]);
1786 
1787  for (m = 0; m < s->lsps; m++)
1788  lsps[n][m] += mean_lsf[m];
1789  stabilize_lsps(lsps[n], s->lsps);
1790  }
1791 
1792  if ((res = synth_frame(ctx, gb, n,
1793  &samples[n * MAX_FRAMESIZE],
1794  lsps[n], n == 0 ? s->prev_lsps : lsps[n - 1],
1795  &excitation[s->history_nsamples + n * MAX_FRAMESIZE],
1796  &synth[s->lsps + n * MAX_FRAMESIZE]))) {
1797  *got_frame_ptr = 0;
1798  return res;
1799  }
1800  }
1801 
1802  /* Statistics? FIXME - we don't check for length, a slight overrun
1803  * will be caught by internal buffer padding, and anything else
1804  * will be skipped, not read. */
1805  if (get_bits1(gb)) {
1806  res = get_bits(gb, 4);
1807  skip_bits(gb, 10 * (res + 1));
1808  }
1809 
1810  if (get_bits_left(gb) < 0) {
1811  wmavoice_flush(ctx);
1812  return AVERROR_INVALIDDATA;
1813  }
1814 
1815  *got_frame_ptr = 1;
1816 
1817  /* Update history */
1818  memcpy(s->prev_lsps, lsps[2],
1819  s->lsps * sizeof(*s->prev_lsps));
1820  memcpy(s->synth_history, &synth[MAX_SFRAMESIZE],
1821  s->lsps * sizeof(*synth));
1822  memcpy(s->excitation_history, &excitation[MAX_SFRAMESIZE],
1823  s->history_nsamples * sizeof(*excitation));
1824  if (s->do_apf)
1825  memmove(s->zero_exc_pf, &s->zero_exc_pf[MAX_SFRAMESIZE],
1826  s->history_nsamples * sizeof(*s->zero_exc_pf));
1827 
1828  return 0;
1829 }
1830 
1831 /**
1832  * Parse the packet header at the start of each packet (input data to this
1833  * decoder).
1834  *
1835  * @param s WMA Voice decoding context private data
1836  * @return <0 on error, nb_superframes on success.
1837  */
1839 {
1840  GetBitContext *gb = &s->gb;
1841  unsigned int res, n_superframes = 0;
1842 
1843  skip_bits(gb, 4); // packet sequence number
1844  s->has_residual_lsps = get_bits1(gb);
1845  do {
1846  res = get_bits(gb, 6); // number of superframes per packet
1847  // (minus first one if there is spillover)
1848  n_superframes += res;
1849  } while (res == 0x3F);
1851 
1852  return get_bits_left(gb) >= 0 ? n_superframes : AVERROR_INVALIDDATA;
1853 }
1854 
1855 /**
1856  * Copy (unaligned) bits from gb/data/size to pb.
1857  *
1858  * @param pb target buffer to copy bits into
1859  * @param data source buffer to copy bits from
1860  * @param size size of the source data, in bytes
1861  * @param gb bit I/O context specifying the current position in the source.
1862  * data. This function might use this to align the bit position to
1863  * a whole-byte boundary before calling #avpriv_copy_bits() on aligned
1864  * source data
1865  * @param nbits the amount of bits to copy from source to target
1866  *
1867  * @note after calling this function, the current position in the input bit
1868  * I/O context is undefined.
1869  */
1870 static void copy_bits(PutBitContext *pb,
1871  const uint8_t *data, int size,
1872  GetBitContext *gb, int nbits)
1873 {
1874  int rmn_bytes, rmn_bits;
1875 
1876  rmn_bits = rmn_bytes = get_bits_left(gb);
1877  if (rmn_bits < nbits)
1878  return;
1879  if (nbits > pb->size_in_bits - put_bits_count(pb))
1880  return;
1881  rmn_bits &= 7; rmn_bytes >>= 3;
1882  if ((rmn_bits = FFMIN(rmn_bits, nbits)) > 0)
1883  put_bits(pb, rmn_bits, get_bits(gb, rmn_bits));
1884  avpriv_copy_bits(pb, data + size - rmn_bytes,
1885  FFMIN(nbits - rmn_bits, rmn_bytes << 3));
1886 }
1887 
1888 /**
1889  * Packet decoding: a packet is anything that the (ASF) demuxer contains,
1890  * and we expect that the demuxer / application provides it to us as such
1891  * (else you'll probably get garbage as output). Every packet has a size of
1892  * ctx->block_align bytes, starts with a packet header (see
1893  * #parse_packet_header()), and then a series of superframes. Superframe
1894  * boundaries may exceed packets, i.e. superframes can split data over
1895  * multiple (two) packets.
1896  *
1897  * For more information about frames, see #synth_superframe().
1898  */
1900  int *got_frame_ptr, AVPacket *avpkt)
1901 {
1902  WMAVoiceContext *s = ctx->priv_data;
1903  GetBitContext *gb = &s->gb;
1904  int size, res, pos;
1905 
1906  /* Packets are sometimes a multiple of ctx->block_align, with a packet
1907  * header at each ctx->block_align bytes. However, FFmpeg's ASF demuxer
1908  * feeds us ASF packets, which may concatenate multiple "codec" packets
1909  * in a single "muxer" packet, so we artificially emulate that by
1910  * capping the packet size at ctx->block_align. */
1911  for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align);
1912  init_get_bits8(&s->gb, avpkt->data, size);
1913 
1914  /* size == ctx->block_align is used to indicate whether we are dealing with
1915  * a new packet or a packet of which we already read the packet header
1916  * previously. */
1917  if (!(size % ctx->block_align)) { // new packet header
1918  if (!size) {
1919  s->spillover_nbits = 0;
1920  s->nb_superframes = 0;
1921  } else {
1922  if ((res = parse_packet_header(s)) < 0)
1923  return res;
1924  s->nb_superframes = res;
1925  }
1926 
1927  /* If the packet header specifies a s->spillover_nbits, then we want
1928  * to push out all data of the previous packet (+ spillover) before
1929  * continuing to parse new superframes in the current packet. */
1930  if (s->sframe_cache_size > 0) {
1931  int cnt = get_bits_count(gb);
1932  if (cnt + s->spillover_nbits > avpkt->size * 8) {
1933  s->spillover_nbits = avpkt->size * 8 - cnt;
1934  }
1935  copy_bits(&s->pb, avpkt->data, size, gb, s->spillover_nbits);
1936  flush_put_bits(&s->pb);
1938  if ((res = synth_superframe(ctx, data, got_frame_ptr)) == 0 &&
1939  *got_frame_ptr) {
1940  cnt += s->spillover_nbits;
1941  s->skip_bits_next = cnt & 7;
1942  res = cnt >> 3;
1943  return res;
1944  } else
1945  skip_bits_long (gb, s->spillover_nbits - cnt +
1946  get_bits_count(gb)); // resync
1947  } else if (s->spillover_nbits) {
1948  skip_bits_long(gb, s->spillover_nbits); // resync
1949  }
1950  } else if (s->skip_bits_next)
1951  skip_bits(gb, s->skip_bits_next);
1952 
1953  /* Try parsing superframes in current packet */
1954  s->sframe_cache_size = 0;
1955  s->skip_bits_next = 0;
1956  pos = get_bits_left(gb);
1957  if (s->nb_superframes-- == 0) {
1958  *got_frame_ptr = 0;
1959  return size;
1960  } else if (s->nb_superframes > 0) {
1961  if ((res = synth_superframe(ctx, data, got_frame_ptr)) < 0) {
1962  return res;
1963  } else if (*got_frame_ptr) {
1964  int cnt = get_bits_count(gb);
1965  s->skip_bits_next = cnt & 7;
1966  res = cnt >> 3;
1967  return res;
1968  }
1969  } else if ((s->sframe_cache_size = pos) > 0) {
1970  /* ... cache it for spillover in next packet */
1972  copy_bits(&s->pb, avpkt->data, size, gb, s->sframe_cache_size);
1973  // FIXME bad - just copy bytes as whole and add use the
1974  // skip_bits_next field
1975  }
1976 
1977  return size;
1978 }
1979 
1981 {
1982  WMAVoiceContext *s = ctx->priv_data;
1983 
1984  if (s->do_apf) {
1985  ff_rdft_end(&s->rdft);
1986  ff_rdft_end(&s->irdft);
1987  ff_dct_end(&s->dct);
1988  ff_dct_end(&s->dst);
1989  }
1990 
1991  return 0;
1992 }
1993 
1995  .name = "wmavoice",
1996  .long_name = NULL_IF_CONFIG_SMALL("Windows Media Audio Voice"),
1997  .type = AVMEDIA_TYPE_AUDIO,
1998  .id = AV_CODEC_ID_WMAVOICE,
1999  .priv_data_size = sizeof(WMAVoiceContext),
2001  .close = wmavoice_decode_end,
2004  .flush = wmavoice_flush,
2005 };
RDFTContext rdft
Definition: wmavoice.c:265
Description of frame types.
Definition: wmavoice.c:99
static void aw_pulse_set1(WMAVoiceContext *s, GetBitContext *gb, int block_idx, AMRFixed *fcb)
Apply first set of pitch-adaptive window pulses.
Definition: wmavoice.c:1176
av_cold void ff_rdft_end(RDFTContext *s)
Definition: rdft.c:114
static const uint8_t wmavoice_dq_lsp16r2[0x500]
#define NULL
Definition: coverity.c:32
const char const char void * val
Definition: avisynth_c.h:863
int do_apf
whether to apply the averaged projection filter (APF)
Definition: wmavoice.c:149
#define AVERROR_INVALIDDATA
Invalid data found when processing input.
Definition: error.h:59
static int pRNG(int frame_cntr, int block_num, int block_size)
Generate a random number from frame_cntr and block_idx, which will live in the range [0...
Definition: wmavoice.c:1237
static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
Set up the variable bit mode (VBM) tree from container extradata.
Definition: wmavoice.c:300
void ff_celp_lp_synthesis_filterf(float *out, const float *filter_coeffs, const float *in, int buffer_length, int filter_length)
LP synthesis filter.
Definition: celp_filters.c:84
float gain_pred_err[6]
cache for gain prediction
Definition: wmavoice.c:250
This structure describes decoded (raw) audio or video data.
Definition: frame.h:295
int aw_next_pulse_off_cache
the position (relative to start of the second block) at which pulses should start to be positioned...
Definition: wmavoice.c:241
int nb_superframes
number of superframes in current packet
Definition: wmavoice.c:249
ptrdiff_t const GLvoid * data
Definition: opengl_enc.c:100
static void flush(AVCodecContext *avctx)
float postfilter_agc
gain control memory, used in adaptive_gain_control()
Definition: wmavoice.c:271
comfort noise during silence generated from a hardcoded (fixed) codebook with per-frame (low) gain va...
Definition: wmavoice.c:84
void ff_acelp_apply_order_2_transfer_function(float *out, const float *in, const float zero_coeffs[2], const float pole_coeffs[2], float gain, float mem[2], int n)
Apply an order 2 rational transfer function in-place.
static void put_bits(Jpeg2000EncoderContext *s, int val, int n)
put n times val bit
Definition: j2kenc.c:208
static unsigned int get_bits(GetBitContext *s, int n)
Read 1-25 bits.
Definition: get_bits.h:379
static void postfilter(WMAVoiceContext *s, const float *synth, float *samples, int size, const float *lpcs, float *zero_exc_pf, int fcb_type, int pitch)
Averaging projection filter, the postfilter used in WMAVoice.
Definition: wmavoice.c:804
Memory handling functions.
void ff_weighted_vector_sumf(float *out, const float *in_a, const float *in_b, float weight_coeff_a, float weight_coeff_b, int length)
float implementation of weighted sum of two vectors.
static void skip_bits_long(GetBitContext *s, int n)
Skips the specified number of bits.
Definition: get_bits.h:291
static av_cold int init(AVCodecContext *avctx)
Definition: avrndec.c:35
#define INIT_VLC_STATIC(vlc, bits, a, b, c, d, e, f, g, static_size)
Definition: vlc.h:75
#define avpriv_request_sample(...)
float synth_filter_out_buf[0x80+MAX_LSPS_ALIGN16]
aligned buffer for postfilter speech synthesis
Definition: wmavoice.c:283
static void aw_parse_coords(WMAVoiceContext *s, GetBitContext *gb, const int *pitch)
Parse the offset of the first pitch-adaptive window pulses, and the distribution of pulses between th...
Definition: wmavoice.c:1034
static const int8_t pulses[4]
Number of non-zero pulses in the MP-MLQ excitation.
Definition: g723_1.h:723
int x[10]
Definition: acelp_vectors.h:55
int size
Definition: avcodec.h:1483
int aw_n_pulses[2]
number of AW-pulses in each block; note that this number can be negative (in which case it basically ...
Definition: wmavoice.c:236
static av_cold void wmavoice_init_static_data(void)
Definition: wmavoice.c:314
static int interpol(MBContext *s, uint32_t *color, int x, int y, int linesize)
void avpriv_copy_bits(PutBitContext *pb, const uint8_t *src, int length)
Copy the content of src to the bitstream.
Definition: bitstream.c:64
static void stabilize_lsps(double *lsps, int num)
Ensure minimum value for first item, maximum value for last value, proper spacing between each value ...
Definition: wmavoice.c:1656
static const float wmavoice_gain_codebook_fcb[128]
static const uint8_t wmavoice_dq_lsp16i1[0x640]
#define a1
Definition: regdef.h:47
static const uint8_t wmavoice_dq_lsp16r1[0x500]
int spillover_nbits
number of bits of the previous packet's last superframe preceding this packet's first full superframe...
Definition: wmavoice.c:188
void ff_set_fixed_vector(float *out, const AMRFixed *in, float scale, int size)
Add fixed vector to an array from a sparse representation.
int block_pitch_nbits
number of bits used to specify the first block's pitch value
Definition: wmavoice.c:167
static const uint8_t wmavoice_dq_lsp16i3[0x300]
float pitch_fac
Definition: acelp_vectors.h:59
adaptive codebook with per-frame pitch, which we interpolate to get a per-sample pitch.
Definition: wmavoice.c:70
static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx, float *samples, const double *lsps, const double *prev_lsps, float *excitation, float *synth)
Synthesize output samples for a single frame.
Definition: wmavoice.c:1480
static void calc_input_response(WMAVoiceContext *s, float *lpcs, int fcb_type, float *coeffs, int remainder)
Derive denoise filter coefficients (in real domain) from the LPCs.
Definition: wmavoice.c:605
static void dequant_lsp10i(GetBitContext *gb, double *lsps)
Parse 10 independently-coded LSPs.
Definition: wmavoice.c:889
int av_log2_16bit(unsigned v)
Definition: intmath.c:31
AVCodec.
Definition: avcodec.h:3494
#define MAX_LSPS_ALIGN16
same as MAX_LSPS; needs to be multiple
Definition: wmavoice.c:49
int block_align
number of bytes per packet if constant and known or 0 Used by some WAV based audio codecs...
Definition: avcodec.h:2267
static void decode(AVCodecContext *dec_ctx, AVPacket *pkt, AVFrame *frame, FILE *outfile)
Definition: decode_audio.c:71
static int aw_pulse_set2(WMAVoiceContext *s, GetBitContext *gb, int block_idx, AMRFixed *fcb)
Apply second set of pitch-adaptive window pulses.
Definition: wmavoice.c:1086
uint8_t base
Definition: vp3data.h:202
static const float wmavoice_ipol1_coeffs[17 *9]
static const uint8_t wmavoice_dq_lsp16i2[0x3c0]
#define AV_CODEC_CAP_DELAY
Encoder or decoder requires flushing with NULL input at the end in order to give the complete and cor...
Definition: avcodec.h:1011
#define av_assert0(cond)
assert() equivalent, that is always enabled.
Definition: avassert.h:37
int spillover_bitsize
number of bits used to specify spillover_nbits in the packet header = ceil(log2(ctx->block_align << 3...
Definition: wmavoice.c:142
int block_delta_pitch_nbits
number of bits used to specify the delta pitch between this and the last block's pitch value...
Definition: wmavoice.c:170
enum AVSampleFormat sample_fmt
audio sample format
Definition: avcodec.h:2238
int mem
Definition: avisynth_c.h:916
uint8_t
#define av_cold
Definition: attributes.h:82
Sparse representation for the algebraic codebook (fixed) vector.
Definition: acelp_vectors.h:53
static const uint8_t wmavoice_dq_lsp16r3[0x600]
float delta
DCTContext dct
Definition: wmavoice.c:267
static const float wmavoice_gain_codebook_acb[128]
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
uint8_t log_n_blocks
log2(n_blocks)
Definition: wmavoice.c:102
int aw_first_pulse_off[2]
index of first sample to which to apply AW-pulses, or -0xff if unset
Definition: wmavoice.c:239
static av_cold int end(AVCodecContext *avctx)
Definition: avrndec.c:90
int has_residual_lsps
if set, superframes contain one set of LSPs that cover all frames, encoded as independent and residua...
Definition: wmavoice.c:192
float tilted_lpcs_pf[0x80]
aligned buffer for LPC tilting
Definition: wmavoice.c:279
uint8_t * extradata
some codecs need / can use extradata like Huffman tables.
Definition: avcodec.h:1671
static float tilt_factor(const float *lpcs, int n_lpcs)
Get the tilt factor of a formant filter from its transfer function.
Definition: wmavoice.c:592
#define u(width, name, range_min, range_max)
Definition: cbs_h2645.c:252
static const uint8_t wmavoice_dq_lsp10r[0x1400]
static void dequant_lsps(double *lsps, int num, const uint16_t *values, const uint16_t *sizes, int n_stages, const uint8_t *table, const double *mul_q, const double *base_q)
Dequantize LSPs.
Definition: wmavoice.c:858
#define DECLARE_ALIGNED(n, t, v)
Declare a variable that is aligned in memory.
Definition: mem.h:112
static const float wmavoice_ipol2_coeffs[32]
Hamming-window sinc function (num = 32, x = [ 0, 31 ]): (0.54 + 0.46 * cos(2 * M_PI * x / (num - 1)))...
uint8_t * data
Definition: avcodec.h:1482
static int get_bits_count(const GetBitContext *s)
Definition: get_bits.h:219
float dcf_mem[2]
DC filter history.
Definition: wmavoice.c:273
void av_memcpy_backptr(uint8_t *dst, int back, int cnt)
Overlapping memcpy() implementation.
Definition: mem.c:426
bitstream reader API header.
static av_cold void wmavoice_flush(AVCodecContext *ctx)
Definition: wmavoice.c:336
float synth_history[MAX_LSPS]
see excitation_history
Definition: wmavoice.c:255
#define max(a, b)
Definition: cuda_runtime.h:33
ptrdiff_t size
Definition: opengl_enc.c:100
hardcoded (fixed) codebook with per-block gain values
Definition: wmavoice.c:87
double prev_lsps[MAX_LSPS]
LSPs of the last frame of the previous superframe.
Definition: wmavoice.c:220
static void copy_bits(PutBitContext *pb, const uint8_t *data, int size, GetBitContext *gb, int nbits)
Copy (unaligned) bits from gb/data/size to pb.
Definition: wmavoice.c:1870
#define AVOnce
Definition: thread.h:159
#define av_log(a,...)
static const uint16_t table[]
Definition: prosumer.c:206
#define expf(x)
Definition: libm.h:283
#define U(x)
Definition: vp56_arith.h:37
static int get_bits_left(GetBitContext *gb)
Definition: get_bits.h:849
int size_in_bits
Definition: put_bits.h:39
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:259
#define AV_LOG_ERROR
Something went wrong and cannot losslessly be recovered.
Definition: log.h:176
static const double wmavoice_mean_lsf16[2][16]
int sframe_cache_size
set to >0 if we have data from an (incomplete) superframe from a previous packet that spilled over in...
Definition: wmavoice.c:204
static const float wmavoice_lsp10_intercoeff_b[32][2][10]
int block_pitch_range
range of the block pitch
Definition: wmavoice.c:169
static const float wmavoice_std_codebook[1000]
static const int sizes[][2]
Definition: img2dec.c:53
int last_acb_type
frame type [0-2] of the previous frame
Definition: wmavoice.c:223
#define NULL_IF_CONFIG_SMALL(x)
Return NULL if CONFIG_SMALL is true, otherwise the argument without modification. ...
Definition: internal.h:186
static const float wmavoice_gain_silence[256]
int denoise_filter_cache_size
samples in denoise_filter_cache
Definition: wmavoice.c:278
int history_nsamples
number of samples in history for signal prediction (through ACB)
Definition: wmavoice.c:145
static const uint8_t wmavoice_dq_lsp10i[0xf00]
Definition: wmavoice_data.h:33
static const float wmavoice_lsp10_intercoeff_a[32][2][10]
#define t1
Definition: regdef.h:29
static const float wmavoice_energy_table[128]
LUT for 1.071575641632 * pow(1.0331663, n - 127)
Windows Media Voice (WMAVoice) tables.
Definition: avfft.h:73
const char * name
Name of the codec implementation.
Definition: avcodec.h:3501
uint8_t bits
Definition: vp3data.h:202
int no_repeat_mask
Definition: acelp_vectors.h:57
int denoise_tilt_corr
Whether to apply tilt correction to the Wiener filter coefficients (postfilter)
Definition: wmavoice.c:153
int aw_idx_is_ext
whether the AW index was encoded in 8 bits (instead of 6)
Definition: wmavoice.c:228
#define t3
Definition: regdef.h:31
#define FFMAX(a, b)
Definition: common.h:94
uint16_t block_conv_table[4]
boundaries for block pitch unit/scale conversion
Definition: wmavoice.c:176
#define MUL16(ra, rb)
Definition: mathops.h:88
DCTContext dst
contexts for phase shift (in Hilbert transform, part of postfilter)
Definition: wmavoice.c:267
int lsp_def_mode
defines different sets of LSP defaults [0, 1]
Definition: wmavoice.c:160
Definition: vlc.h:26
uint64_t channel_layout
Audio channel layout.
Definition: avcodec.h:2281
static int put_bits_count(PutBitContext *s)
Definition: put_bits.h:67
#define powf(x, y)
Definition: libm.h:50
int skip_bits_next
number of bits to skip at the next call to wmavoice_decode_packet() (since they're part of the previo...
Definition: wmavoice.c:197
static void dequant_lsp16r(GetBitContext *gb, double *i_lsps, const double *old, double *a1, double *a2, int q_mode)
Parse 16 independently-coded LSPs, and then derive the tables to generate LSPs for the other frames f...
Definition: wmavoice.c:984
int min_pitch_val
base value for pitch parsing code
Definition: wmavoice.c:163
WMA Voice decoding context.
Definition: wmavoice.c:131
static void wiener_denoise(WMAVoiceContext *s, int fcb_type, float *synth_pf, int size, const float *lpcs)
This function applies a Wiener filter on the (noisy) speech signal as a means to denoise it...
Definition: wmavoice.c:722
int denoise_strength
strength of denoising in Wiener filter [0-11]
Definition: wmavoice.c:151
uint8_t sframe_cache[SFRAME_CACHE_MAXSIZE+AV_INPUT_BUFFER_PADDING_SIZE]
cache for superframe data split over multiple packets
Definition: wmavoice.c:201
audio channel layout utility functions
Definition: avfft.h:97
#define FFMIN(a, b)
Definition: common.h:96
#define log_range(var, assign)
#define MAX_LSPS
maximum filter order
Definition: wmavoice.c:48
static VLC frame_type_vlc
Frame type VLC coding.
Definition: wmavoice.c:63
int pitch_nbits
number of bits used to specify the pitch value in the frame header
Definition: wmavoice.c:165
these buffered frames must be flushed immediately if a new input produces new the filter must not call request_frame to get more It must just process the frame or queue it The task of requesting more frames is left to the filter s request_frame method or the application If a filter has several the filter must be ready for frames arriving randomly on any input any filter with several inputs will most likely require some kind of queuing mechanism It is perfectly acceptable to have a limited queue and to drop frames when the inputs are too unbalanced request_frame For filters that do not use the this method is called when a frame is wanted on an output For a it should directly call filter_frame on the corresponding output For a if there are queued frames already one of these frames should be pushed If the filter should request a frame on one of its repeatedly until at least one frame has been pushed Return values
#define MAX_BLOCKS
maximum number of blocks per frame
Definition: wmavoice.c:47
float denoise_coeffs_pf[0x80]
aligned buffer for denoise coefficients
Definition: wmavoice.c:281
void(* dct_calc)(struct DCTContext *s, FFTSample *data)
Definition: dct.h:38
static void dequant_lsp10r(GetBitContext *gb, double *i_lsps, const double *old, double *a1, double *a2, int q_mode)
Parse 10 independently-coded LSPs, and then derive the tables to generate LSPs for the other frames f...
Definition: wmavoice.c:915
float y[10]
Definition: acelp_vectors.h:56
AVFormatContext * ctx
Definition: movenc.c:48
static av_always_inline unsigned UMULH(unsigned a, unsigned b)
Definition: mathops.h:68
#define a2
Definition: regdef.h:48
Definition: dct.h:32
these buffered frames must be flushed immediately if a new input produces new the filter must not call request_frame to get more It must just process the frame or queue it The task of requesting more frames is left to the filter s request_frame method or the application If a filter has several the filter must be ready for frames arriving randomly on any input any filter with several inputs will most likely require some kind of queuing mechanism It is perfectly acceptable to have a limited queue and to drop frames when the inputs are too unbalanced request_frame For filters that do not use the this method is called when a frame is wanted on an output For a it should directly call filter_frame on the corresponding output For a if there are queued frames already one of these frames should be pushed If the filter should request a frame on one of its repeatedly until at least one frame has been pushed Return or at least make progress towards producing a frame
float sin[511]
Definition: wmavoice.c:269
#define s(width, name)
Definition: cbs_vp9.c:257
static av_always_inline int get_vlc2(GetBitContext *s, VLC_TYPE(*table)[2], int bits, int max_depth)
Parse a vlc code.
Definition: get_bits.h:797
Definition: avfft.h:72
int n
Definition: avisynth_c.h:760
void(* rdft_calc)(struct RDFTContext *s, FFTSample *z)
Definition: rdft.h:38
static int kalman_smoothen(WMAVoiceContext *s, int pitch, const float *in, float *out, int size)
Kalman smoothing function.
Definition: wmavoice.c:546
void ff_tilt_compensation(float *mem, float tilt, float *samples, int size)
Apply tilt compensation filter, 1 - tilt * z-1.
if(ret)
static const float wmavoice_gain_universal[64]
void ff_acelp_lspd2lpc(const double *lsp, float *lpc, int lp_half_order)
Reconstruct LPC coefficients from the line spectral pair frequencies.
Definition: lsp.c:209
static av_cold int wmavoice_decode_init(AVCodecContext *ctx)
Set up decoder with parameters from demuxer (extradata etc.).
Definition: wmavoice.c:367
#define AVERROR_PATCHWELCOME
Not yet implemented in FFmpeg, patches welcome.
Definition: error.h:62
static const uint8_t last_coeff[3]
Definition: qdm2data.h:257
static const struct frame_type_desc frame_descs[17]
#define AV_ONCE_INIT
Definition: thread.h:160
float denoise_filter_cache[MAX_FRAMESIZE]
Definition: wmavoice.c:277
Libavcodec external API header.
int sample_rate
samples per second
Definition: avcodec.h:2230
void AAC_RENAME() ff_sine_window_init(INTFLOAT *window, int n)
Generate a sine window.
static int wmavoice_decode_packet(AVCodecContext *ctx, void *data, int *got_frame_ptr, AVPacket *avpkt)
Packet decoding: a packet is anything that the (ASF) demuxer contains, and we expect that the demuxer...
Definition: wmavoice.c:1899
static int init_get_bits8(GetBitContext *s, const uint8_t *buffer, int byte_size)
Initialize GetBitContext.
Definition: get_bits.h:677
#define abs(x)
Definition: cuda_runtime.h:35
static const int16_t alpha[]
Definition: ilbcdata.h:55
main external API structure.
Definition: avcodec.h:1570
static int parse_packet_header(WMAVoiceContext *s)
Parse the packet header at the start of each packet (input data to this decoder). ...
Definition: wmavoice.c:1838
int ff_get_buffer(AVCodecContext *avctx, AVFrame *frame, int flags)
Get a buffer for a frame.
Definition: decode.c:1968
AVCodec ff_wmavoice_decoder
Definition: wmavoice.c:1994
int8_t vbm_tree[25]
converts VLC codes to frame type
Definition: wmavoice.c:140
int extradata_size
Definition: avcodec.h:1672
static unsigned int get_bits1(GetBitContext *s)
Definition: get_bits.h:498
static void synth_block(WMAVoiceContext *s, GetBitContext *gb, int block_idx, int size, int block_pitch_sh2, const double *lsps, const double *prev_lsps, const struct frame_type_desc *frame_desc, float *excitation, float *synth)
Parse data in a single block.
Definition: wmavoice.c:1438
static av_cold int wmavoice_decode_end(AVCodecContext *ctx)
Definition: wmavoice.c:1980
uint8_t pi<< 24) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi-0x80)*(1.0f/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi-0x80)*(1.0/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S16, int16_t,(*(const int16_t *) pi >> 8)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S16, int16_t,*(const int16_t *) pi *(1.0f/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S16, int16_t,*(const int16_t *) pi *(1.0/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S32, int32_t,(*(const int32_t *) pi >> 24)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S32, int32_t,*(const int32_t *) pi *(1.0f/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S32, int32_t,*(const int32_t *) pi *(1.0/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_FLT, float, av_clip_uint8(lrintf(*(const float *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_FLT, float, av_clip_int16(lrintf(*(const float *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_FLT, float, av_clipl_int32(llrintf(*(const float *) pi *(1U<< 31)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_DBL, double, av_clip_uint8(lrint(*(const double *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_DBL, double, av_clip_int16(lrint(*(const double *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_DBL, double, av_clipl_int32(llrint(*(const double *) pi *(1U<< 31))))#define SET_CONV_FUNC_GROUP(ofmt, ifmt) static void set_generic_function(AudioConvert *ac){}void ff_audio_convert_free(AudioConvert **ac){if(!*ac) return;ff_dither_free(&(*ac) ->dc);av_freep(ac);}AudioConvert *ff_audio_convert_alloc(AVAudioResampleContext *avr, enum AVSampleFormat out_fmt, enum AVSampleFormat in_fmt, int channels, int sample_rate, 
int apply_map){AudioConvert *ac;int in_planar, out_planar;ac=av_mallocz(sizeof(*ac));if(!ac) return NULL;ac->avr=avr;ac->out_fmt=out_fmt;ac->in_fmt=in_fmt;ac->channels=channels;ac->apply_map=apply_map;if(avr->dither_method!=AV_RESAMPLE_DITHER_NONE &&av_get_packed_sample_fmt(out_fmt)==AV_SAMPLE_FMT_S16 &&av_get_bytes_per_sample(in_fmt) > 2){ac->dc=ff_dither_alloc(avr, out_fmt, in_fmt, channels, sample_rate, apply_map);if(!ac->dc){av_free(ac);return NULL;}return ac;}in_planar=ff_sample_fmt_is_planar(in_fmt, channels);out_planar=ff_sample_fmt_is_planar(out_fmt, channels);if(in_planar==out_planar){ac->func_type=CONV_FUNC_TYPE_FLAT;ac->planes=in_planar?ac->channels:1;}else if(in_planar) ac->func_type=CONV_FUNC_TYPE_INTERLEAVE;else ac->func_type=CONV_FUNC_TYPE_DEINTERLEAVE;set_generic_function(ac);if(ARCH_AARCH64) ff_audio_convert_init_aarch64(ac);if(ARCH_ARM) ff_audio_convert_init_arm(ac);if(ARCH_X86) ff_audio_convert_init_x86(ac);return ac;}int ff_audio_convert(AudioConvert *ac, AudioData *out, AudioData *in){int use_generic=1;int len=in->nb_samples;int p;if(ac->dc){av_log(ac->avr, AV_LOG_TRACE,"%d samples - audio_convert: %s to %s (dithered)\n", len, av_get_sample_fmt_name(ac->in_fmt), av_get_sample_fmt_name(ac->out_fmt));return ff_convert_dither(ac-> in
static void skip_bits(GetBitContext *s, int n)
Definition: get_bits.h:467
av_cold int ff_dct_init(DCTContext *s, int nbits, enum DCTTransformType inverse)
Set up DCT.
Definition: dct.c:177
#define AV_CODEC_CAP_SUBFRAMES
Codec can output multiple frames per AVPacket Normally demuxers return one frame at a time...
Definition: avcodec.h:1029
int pitch_diff_sh16
((cur_pitch_val - last_pitch_val) << 16) / MAX_FRAMESIZE
Definition: wmavoice.c:224
static int init_get_bits(GetBitContext *s, const uint8_t *buffer, int bit_size)
Initialize GetBitContext.
Definition: get_bits.h:659
#define MAX_SFRAMESIZE
maximum number of samples per superframe
Definition: wmavoice.c:54
int lsp_q_mode
defines quantizer defaults [0, 1]
Definition: wmavoice.c:159
int frame_cntr
current frame index [0 - 0xFFFE]; is only used for comfort noise in pRNG()
Definition: wmavoice.c:247
void ff_celp_lp_zero_synthesis_filterf(float *out, const float *filter_coeffs, const float *in, int buffer_length, int filter_length)
LP zero synthesis filter.
Definition: celp_filters.c:199
float avpriv_scalarproduct_float_c(const float *v1, const float *v2, int len)
Return the scalar product of two vectors.
Definition: float_dsp.c:124
static void adaptive_gain_control(float *out, const float *in, const float *speech_synth, int size, float alpha, float *gain_mem)
Adaptive gain control (as used in postfilter).
Definition: wmavoice.c:505
Innovation (fixed) codebook pulse sets in combinations of either single pulses or pulse pairs...
Definition: wmavoice.c:91
static const float mean_lsf[10]
Definition: siprdata.h:27
#define SFRAME_CACHE_MAXSIZE
maximum cache size for frame data that was split over two packets
Definition: wmavoice.c:56
void av_frame_unref(AVFrame *frame)
Unreference all the buffers referenced by frame and reset the frame fields.
Definition: frame.c:553
uint8_t fcb_type
Fixed codebook type (FCB_TYPE_*)
Definition: wmavoice.c:104
#define flags(name, subs,...)
Definition: cbs_av1.c:561
static void dequant_lsp16i(GetBitContext *gb, double *lsps)
Parse 16 independently-coded LSPs.
Definition: wmavoice.c:951
no adaptive codebook (only hardcoded fixed)
Definition: wmavoice.c:69
RDFTContext irdft
contexts for FFT-calculation in the postfilter (for denoise filter)
Definition: wmavoice.c:265
uint8_t * data[AV_NUM_DATA_POINTERS]
pointer to the picture/channel planes.
Definition: frame.h:309
static int synth_superframe(AVCodecContext *ctx, AVFrame *frame, int *got_frame_ptr)
Synthesize output samples for a single superframe.
Definition: wmavoice.c:1702
#define M_LN10
Definition: mathematics.h:43
Pitch-adaptive window (AW) pulse signals, used in particular for low-bitrate streams.
Definition: wmavoice.c:89
static void synth_block_hardcoded(WMAVoiceContext *s, GetBitContext *gb, int block_idx, int size, const struct frame_type_desc *frame_desc, float *excitation)
Parse hardcoded signal for a single block.
Definition: wmavoice.c:1273
uint8_t n_blocks
amount of blocks per frame (each block contains 160/n_blocks samples)
Definition: wmavoice.c:100
common internal api header.
static void flush_put_bits(PutBitContext *s)
Pad the end of the output stream with zeros.
Definition: put_bits.h:101
int pitch_lag
Definition: acelp_vectors.h:58
float excitation_history[MAX_SIGNAL_HISTORY]
cache of the signal of previous superframes, used as a history for signal generation ...
Definition: wmavoice.c:251
static void init_put_bits(PutBitContext *s, uint8_t *buffer, int buffer_size)
Initialize the PutBitContext s.
Definition: put_bits.h:48
int last_pitch_val
pitch value of the previous frame
Definition: wmavoice.c:222
#define AV_INPUT_BUFFER_PADDING_SIZE
Required number of additionally allocated bytes at the end of the input bitstream for decoding...
Definition: avcodec.h:795
void * priv_data
Definition: avcodec.h:1597
#define MAX_FRAMESIZE
maximum number of samples per frame
Definition: wmavoice.c:52
float silence_gain
set for use in blocks if ACB_TYPE_NONE
Definition: wmavoice.c:226
static const double wmavoice_mean_lsf10[2][10]
static void dct(AudioRNNContext *s, float *out, const float *in)
Definition: af_arnndn.c:992
int len
int channels
number of audio channels
Definition: avcodec.h:2231
static int ff_thread_once(char *control, void(*routine)(void))
Definition: thread.h:162
VLC_TYPE(* table)[2]
code, bits
Definition: vlc.h:28
#define lrint
Definition: tablegen.h:53
av_cold void ff_dct_end(DCTContext *s)
Definition: dct.c:221
void ff_acelp_interpolatef(float *out, const float *in, const float *filter_coeffs, int precision, int frac_pos, int filter_length, int length)
Floating point version of ff_acelp_interpolate()
Definition: acelp_filters.c:78
int block_delta_pitch_hrange
1/2 range of the delta (full range is from -this to +this-1)
Definition: wmavoice.c:174
int max_pitch_val
max value + 1 for pitch parsing
Definition: wmavoice.c:164
#define av_uninit(x)
Definition: attributes.h:148
Per-block pitch with signal generation using a Hamming sinc window function.
Definition: wmavoice.c:75
int lsps
number of LSPs per frame [10 or 16]
Definition: wmavoice.c:158
FILE * out
Definition: movenc.c:54
#define MAX_FRAMES
maximum number of frames per superframe
Definition: wmavoice.c:51
Filter the word “frame” indicates either a video frame or a group of audio samples
static const float wmavoice_lsp16_intercoeff_b[32][2][16]
PutBitContext pb
bitstream writer for sframe_cache
Definition: wmavoice.c:209
#define M_PI
Definition: mathematics.h:52
uint8_t acb_type
Adaptive codebook type (ACB_TYPE_*)
Definition: wmavoice.c:103
static const float wmavoice_denoise_power_table[12][64]
LUT for f(x,y) = pow((y + 6.9) / 64, 0.025 * (x + 1)).
int dc_level
Predicted amount of DC noise, based on which a DC removal filter is used.
Definition: wmavoice.c:155
#define VLC_NBITS
number of bits to read per VLC iteration
Definition: wmavoice.c:58
static const float wmavoice_lsp16_intercoeff_a[32][2][16]
Definition: avfft.h:96
Filter: the word “frame” indicates either a video frame or a group of audio samples, as stored in an AVFrame structure. Format: for each input and each output, the list of supported formats. For video that means pixel format. For audio that means channel layout and sample format; they are references to shared objects. When the negotiation mechanism computes the intersection of the formats supported at each end of a link, all references to both lists are replaced with a reference to the intersection. And when a single format is eventually chosen for a link amongst the remaining list, all references to the list are updated. That means that if a filter requires that its input and output have the same format amongst a supported list, all it has to do is use a reference to the same list of formats. query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism to try again later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
float cos[511]
8-bit cosine/sine windows over [-pi,pi] range
Definition: wmavoice.c:269
#define AV_CH_LAYOUT_MONO
av_cold int ff_rdft_init(RDFTContext *s, int nbits, enum RDFTransformType trans)
Set up a real FFT.
Definition: rdft.c:88
int aw_pulse_range
the range over which aw_pulse_set1() can apply the pulse, relative to the value in aw_first_pulse_off...
Definition: wmavoice.c:230
float min
uint64_t_TMPL AV_WL64 unsigned int_TMPL AV_RL32
Definition: bytestream.h:87
This structure stores compressed data.
Definition: avcodec.h:1459
int nb_samples
number of audio samples (per channel) described by this frame
Definition: frame.h:361
float zero_exc_pf[MAX_SIGNAL_HISTORY+MAX_SFRAMESIZE]
zero filter output (i.e. excitation) by postfilter
Definition: wmavoice.c:274
#define AV_CODEC_CAP_DR1
Codec uses get_buffer() for allocating buffers and supports custom allocators.
Definition: avcodec.h:986
for(j=16;j >0;--j)
static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb, int block_idx, int size, int block_pitch_sh2, const struct frame_type_desc *frame_desc, float *excitation)
Parse FCB/ACB signal for a single block.
Definition: wmavoice.c:1304
uint8_t dbl_pulses
how many pulse vectors have pulse pairs (rather than just one single pulse) only if fcb_type == FCB_T...
Definition: wmavoice.c:105
#define t2
Definition: regdef.h:30
#define MAX_SIGNAL_HISTORY
maximum excitation signal history
Definition: wmavoice.c:53
#define MULH
Definition: mathops.h:42
GetBitContext gb
packet bitreader.
Definition: wmavoice.c:136
static uint8_t tmp[11]
Definition: aes_ctr.c:26
bitstream writer API