FFmpeg
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
wmavoice.c
Go to the documentation of this file.
1 /*
2  * Windows Media Audio Voice decoder.
3  * Copyright (c) 2009 Ronald S. Bultje
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 /**
23  * @file
24  * @brief Windows Media Audio Voice compatible decoder
25  * @author Ronald S. Bultje <rsbultje@gmail.com>
26  */
27 
28 #define UNCHECKED_BITSTREAM_READER 1
29 
30 #include <math.h>
31 
33 #include "libavutil/mem.h"
34 #include "dsputil.h"
35 #include "avcodec.h"
36 #include "internal.h"
37 #include "get_bits.h"
38 #include "put_bits.h"
39 #include "wmavoice_data.h"
40 #include "celp_filters.h"
41 #include "acelp_vectors.h"
42 #include "acelp_filters.h"
43 #include "lsp.h"
44 #include "dct.h"
45 #include "rdft.h"
46 #include "sinewin.h"
47 
48 #define MAX_BLOCKS 8 ///< maximum number of blocks per frame
49 #define MAX_LSPS 16 ///< maximum filter order
50 #define MAX_LSPS_ALIGN16 16 ///< same as #MAX_LSPS; needs to be multiple
51  ///< of 16 for ASM input buffer alignment
52 #define MAX_FRAMES 3 ///< maximum number of frames per superframe
53 #define MAX_FRAMESIZE 160 ///< maximum number of samples per frame
54 #define MAX_SIGNAL_HISTORY 416 ///< maximum excitation signal history
55 #define MAX_SFRAMESIZE (MAX_FRAMESIZE * MAX_FRAMES)
56  ///< maximum number of samples per superframe
57 #define SFRAME_CACHE_MAXSIZE 256 ///< maximum cache size for frame data that
58  ///< was split over two packets
59 #define VLC_NBITS 6 ///< number of bits to read per VLC iteration
60 
61 /**
62  * Frame type VLC coding.
63  */
65 
/**
 * Adaptive codebook (ACB) types.
 */
enum {
    /** No adaptive codebook; only the hardcoded fixed codebook is used. */
    ACB_TYPE_NONE = 0,

    /**
     * Adaptive codebook driven by a per-frame pitch, which is interpolated
     * to obtain a per-sample pitch. The signal is generated using an
     * asymmetric sinc window function.
     * @note see #wmavoice_ipol1_coeffs
     */
    ACB_TYPE_ASYMMETRIC = 1,

    /**
     * Per-block pitch, with signal generation using a Hamming sinc window
     * function.
     * @note see #wmavoice_ipol2_coeffs
     */
    ACB_TYPE_HAMMING = 2
};
80 
/**
 * Fixed codebook (FCB) types.
 */
enum {
    /**
     * Comfort noise during silence, generated from a hardcoded (fixed)
     * codebook with per-frame (low) gain values.
     */
    FCB_TYPE_SILENCE = 0,

    /** Hardcoded (fixed) codebook with per-block gain values. */
    FCB_TYPE_HARDCODED = 1,

    /**
     * Pitch-adaptive window (AW) pulse signals, used in particular for
     * low-bitrate streams.
     */
    FCB_TYPE_AW_PULSES = 2,

    /**
     * Innovation (fixed) codebook pulse sets, in combinations of either
     * single pulses or pulse pairs.
     */
    FCB_TYPE_EXC_PULSES = 3,
};
96 
97 /**
98  * Description of frame types.
99  */
100 static const struct frame_type_desc {
101  uint8_t n_blocks; ///< amount of blocks per frame (each block
102  ///< (contains 160/#n_blocks samples)
103  uint8_t log_n_blocks; ///< log2(#n_blocks)
104  uint8_t acb_type; ///< Adaptive codebook type (ACB_TYPE_*)
105  uint8_t fcb_type; ///< Fixed codebook type (FCB_TYPE_*)
106  uint8_t dbl_pulses; ///< how many pulse vectors have pulse pairs
107  ///< (rather than just one single pulse)
108  ///< only if #fcb_type == #FCB_TYPE_EXC_PULSES
109  uint16_t frame_size; ///< the amount of bits that make up the block
110  ///< data (per frame)
111 } frame_descs[17] = {
112  { 1, 0, ACB_TYPE_NONE, FCB_TYPE_SILENCE, 0, 0 },
113  { 2, 1, ACB_TYPE_NONE, FCB_TYPE_HARDCODED, 0, 28 },
114  { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_AW_PULSES, 0, 46 },
115  { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2, 80 },
116  { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 104 },
117  { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 0, 108 },
118  { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2, 132 },
119  { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 168 },
120  { 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0, 64 },
121  { 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2, 80 },
122  { 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5, 104 },
123  { 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0, 108 },
124  { 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2, 132 },
125  { 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5, 168 },
126  { 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0, 176 },
127  { 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2, 208 },
128  { 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5, 256 }
129 };
130 
131 /**
132  * WMA Voice decoding context.
133  */
134 typedef struct {
135  /**
136  * @name Global values specified in the stream header / extradata or used all over.
137  * @{
138  */
140  GetBitContext gb; ///< packet bitreader. During decoder init,
141  ///< it contains the extradata from the
142  ///< demuxer. During decoding, it contains
143  ///< packet data.
144  int8_t vbm_tree[25]; ///< converts VLC codes to frame type
145 
146  int spillover_bitsize; ///< number of bits used to specify
147  ///< #spillover_nbits in the packet header
148  ///< = ceil(log2(ctx->block_align << 3))
149  int history_nsamples; ///< number of samples in history for signal
150  ///< prediction (through ACB)
151 
152  /* postfilter specific values */
153  int do_apf; ///< whether to apply the averaged
154  ///< projection filter (APF)
155  int denoise_strength; ///< strength of denoising in Wiener filter
156  ///< [0-11]
157  int denoise_tilt_corr; ///< Whether to apply tilt correction to the
158  ///< Wiener filter coefficients (postfilter)
159  int dc_level; ///< Predicted amount of DC noise, based
160  ///< on which a DC removal filter is used
161 
162  int lsps; ///< number of LSPs per frame [10 or 16]
163  int lsp_q_mode; ///< defines quantizer defaults [0, 1]
164  int lsp_def_mode; ///< defines different sets of LSP defaults
165  ///< [0, 1]
166  int frame_lsp_bitsize; ///< size (in bits) of LSPs, when encoded
167  ///< per-frame (independent coding)
168  int sframe_lsp_bitsize; ///< size (in bits) of LSPs, when encoded
169  ///< per superframe (residual coding)
170 
171  int min_pitch_val; ///< base value for pitch parsing code
172  int max_pitch_val; ///< max value + 1 for pitch parsing
173  int pitch_nbits; ///< number of bits used to specify the
174  ///< pitch value in the frame header
175  int block_pitch_nbits; ///< number of bits used to specify the
176  ///< first block's pitch value
177  int block_pitch_range; ///< range of the block pitch
178  int block_delta_pitch_nbits; ///< number of bits used to specify the
179  ///< delta pitch between this and the last
180  ///< block's pitch value, used in all but
181  ///< first block
182  int block_delta_pitch_hrange; ///< 1/2 range of the delta (full range is
183  ///< from -this to +this-1)
184  uint16_t block_conv_table[4]; ///< boundaries for block pitch unit/scale
185  ///< conversion
186 
187  /**
188  * @}
189  *
190  * @name Packet values specified in the packet header or related to a packet.
191  *
192  * A packet is considered to be a single unit of data provided to this
193  * decoder by the demuxer.
194  * @{
195  */
196  int spillover_nbits; ///< number of bits of the previous packet's
197  ///< last superframe preceding this
198  ///< packet's first full superframe (useful
199  ///< for re-synchronization also)
200  int has_residual_lsps; ///< if set, superframes contain one set of
201  ///< LSPs that cover all frames, encoded as
202  ///< independent and residual LSPs; if not
203  ///< set, each frame contains its own, fully
204  ///< independent, LSPs
205  int skip_bits_next; ///< number of bits to skip at the next call
206  ///< to #wmavoice_decode_packet() (since
207  ///< they're part of the previous superframe)
208 
210  ///< cache for superframe data split over
211  ///< multiple packets
212  int sframe_cache_size; ///< set to >0 if we have data from an
213  ///< (incomplete) superframe from a previous
214  ///< packet that spilled over in the current
215  ///< packet; specifies the amount of bits in
216  ///< #sframe_cache
217  PutBitContext pb; ///< bitstream writer for #sframe_cache
218 
219  /**
220  * @}
221  *
222  * @name Frame and superframe values
223  * Superframe and frame data - these can change from frame to frame,
224  * although some of them do in that case serve as a cache / history for
225  * the next frame or superframe.
226  * @{
227  */
228  double prev_lsps[MAX_LSPS]; ///< LSPs of the last frame of the previous
229  ///< superframe
230  int last_pitch_val; ///< pitch value of the previous frame
231  int last_acb_type; ///< frame type [0-2] of the previous frame
232  int pitch_diff_sh16; ///< ((cur_pitch_val - #last_pitch_val)
233  ///< << 16) / #MAX_FRAMESIZE
234  float silence_gain; ///< set for use in blocks if #ACB_TYPE_NONE
235 
236  int aw_idx_is_ext; ///< whether the AW index was encoded in
237  ///< 8 bits (instead of 6)
238  int aw_pulse_range; ///< the range over which #aw_pulse_set1()
239  ///< can apply the pulse, relative to the
240  ///< value in aw_first_pulse_off. The exact
241  ///< position of the first AW-pulse is within
242  ///< [pulse_off, pulse_off + this], and
243  ///< depends on bitstream values; [16 or 24]
244  int aw_n_pulses[2]; ///< number of AW-pulses in each block; note
245  ///< that this number can be negative (in
246  ///< which case it basically means "zero")
247  int aw_first_pulse_off[2]; ///< index of first sample to which to
248  ///< apply AW-pulses, or -0xff if unset
249  int aw_next_pulse_off_cache; ///< the position (relative to start of the
250  ///< second block) at which pulses should
251  ///< start to be positioned, serves as a
252  ///< cache for pitch-adaptive window pulses
253  ///< between blocks
254 
255  int frame_cntr; ///< current frame index [0 - 0xFFFE]; is
256  ///< only used for comfort noise in #pRNG()
257  float gain_pred_err[6]; ///< cache for gain prediction
258  float excitation_history[MAX_SIGNAL_HISTORY];
259  ///< cache of the signal of previous
260  ///< superframes, used as a history for
261  ///< signal generation
262  float synth_history[MAX_LSPS]; ///< see #excitation_history
263  /**
264  * @}
265  *
266  * @name Postfilter values
267  *
268  * Variables used for postfilter implementation, mostly history for
269  * smoothing and so on, and context variables for FFT/iFFT.
270  * @{
271  */
272  RDFTContext rdft, irdft; ///< contexts for FFT-calculation in the
273  ///< postfilter (for denoise filter)
274  DCTContext dct, dst; ///< contexts for phase shift (in Hilbert
275  ///< transform, part of postfilter)
276  float sin[511], cos[511]; ///< 8-bit cosine/sine windows over [-pi,pi]
277  ///< range
278  float postfilter_agc; ///< gain control memory, used in
279  ///< #adaptive_gain_control()
280  float dcf_mem[2]; ///< DC filter history
281  float zero_exc_pf[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE];
282  ///< zero filter output (i.e. excitation)
283  ///< by postfilter
284  float denoise_filter_cache[MAX_FRAMESIZE];
285  int denoise_filter_cache_size; ///< samples in #denoise_filter_cache
286  DECLARE_ALIGNED(32, float, tilted_lpcs_pf)[0x80];
287  ///< aligned buffer for LPC tilting
288  DECLARE_ALIGNED(32, float, denoise_coeffs_pf)[0x80];
289  ///< aligned buffer for denoise coefficients
290  DECLARE_ALIGNED(32, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
291  ///< aligned buffer for postfilter speech
292  ///< synthesis
293  /**
294  * @}
295  */
297 
298 /**
299  * Set up the variable bit mode (VBM) tree from container extradata.
300  * @param gb bit I/O context.
301  * The bit context (s->gb) should be loaded with byte 23-46 of the
302  * container extradata (i.e. the ones containing the VBM tree).
303  * @param vbm_tree pointer to array to which the decoded VBM tree will be
304  * written.
305  * @return 0 on success, <0 on error.
306  */
307 static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
308 {
309  static const uint8_t bits[] = {
310  2, 2, 2, 4, 4, 4,
311  6, 6, 6, 8, 8, 8,
312  10, 10, 10, 12, 12, 12,
313  14, 14, 14, 14
314  };
315  static const uint16_t codes[] = {
316  0x0000, 0x0001, 0x0002, // 00/01/10
317  0x000c, 0x000d, 0x000e, // 11+00/01/10
318  0x003c, 0x003d, 0x003e, // 1111+00/01/10
319  0x00fc, 0x00fd, 0x00fe, // 111111+00/01/10
320  0x03fc, 0x03fd, 0x03fe, // 11111111+00/01/10
321  0x0ffc, 0x0ffd, 0x0ffe, // 1111111111+00/01/10
322  0x3ffc, 0x3ffd, 0x3ffe, 0x3fff // 111111111111+xx
323  };
324  int cntr[8] = { 0 }, n, res;
325 
326  memset(vbm_tree, 0xff, sizeof(vbm_tree[0]) * 25);
327  for (n = 0; n < 17; n++) {
328  res = get_bits(gb, 3);
329  if (cntr[res] > 3) // should be >= 3 + (res == 7))
330  return -1;
331  vbm_tree[res * 3 + cntr[res]++] = n;
332  }
333  INIT_VLC_STATIC(&frame_type_vlc, VLC_NBITS, sizeof(bits),
334  bits, 1, 1, codes, 2, 2, 132);
335  return 0;
336 }
337 
338 /**
339  * Set up decoder with parameters from demuxer (extradata etc.).
340  */
342 {
343  int n, flags, pitch_range, lsp16_flag;
344  WMAVoiceContext *s = ctx->priv_data;
345 
346  /**
347  * Extradata layout:
348  * - byte 0-18: WMAPro-in-WMAVoice extradata (see wmaprodec.c),
349  * - byte 19-22: flags field (annoyingly in LE; see below for known
350  * values),
351  * - byte 23-46: variable bitmode tree (really just 17 * 3 bits,
352  * rest is 0).
353  */
354  if (ctx->extradata_size != 46) {
355  av_log(ctx, AV_LOG_ERROR,
356  "Invalid extradata size %d (should be 46)\n",
357  ctx->extradata_size);
358  return -1;
359  }
360  flags = AV_RL32(ctx->extradata + 18);
361  s->spillover_bitsize = 3 + av_ceil_log2(ctx->block_align);
362  s->do_apf = flags & 0x1;
363  if (s->do_apf) {
364  ff_rdft_init(&s->rdft, 7, DFT_R2C);
365  ff_rdft_init(&s->irdft, 7, IDFT_C2R);
366  ff_dct_init(&s->dct, 6, DCT_I);
367  ff_dct_init(&s->dst, 6, DST_I);
368 
369  ff_sine_window_init(s->cos, 256);
370  memcpy(&s->sin[255], s->cos, 256 * sizeof(s->cos[0]));
371  for (n = 0; n < 255; n++) {
372  s->sin[n] = -s->sin[510 - n];
373  s->cos[510 - n] = s->cos[n];
374  }
375  }
376  s->denoise_strength = (flags >> 2) & 0xF;
377  if (s->denoise_strength >= 12) {
378  av_log(ctx, AV_LOG_ERROR,
379  "Invalid denoise filter strength %d (max=11)\n",
380  s->denoise_strength);
381  return -1;
382  }
383  s->denoise_tilt_corr = !!(flags & 0x40);
384  s->dc_level = (flags >> 7) & 0xF;
385  s->lsp_q_mode = !!(flags & 0x2000);
386  s->lsp_def_mode = !!(flags & 0x4000);
387  lsp16_flag = flags & 0x1000;
388  if (lsp16_flag) {
389  s->lsps = 16;
390  s->frame_lsp_bitsize = 34;
391  s->sframe_lsp_bitsize = 60;
392  } else {
393  s->lsps = 10;
394  s->frame_lsp_bitsize = 24;
395  s->sframe_lsp_bitsize = 48;
396  }
397  for (n = 0; n < s->lsps; n++)
398  s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
399 
400  init_get_bits(&s->gb, ctx->extradata + 22, (ctx->extradata_size - 22) << 3);
401  if (decode_vbmtree(&s->gb, s->vbm_tree) < 0) {
402  av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree; broken extradata?\n");
403  return -1;
404  }
405 
406  s->min_pitch_val = ((ctx->sample_rate << 8) / 400 + 50) >> 8;
407  s->max_pitch_val = ((ctx->sample_rate << 8) * 37 / 2000 + 50) >> 8;
408  pitch_range = s->max_pitch_val - s->min_pitch_val;
409  if (pitch_range <= 0) {
410  av_log(ctx, AV_LOG_ERROR, "Invalid pitch range; broken extradata?\n");
411  return -1;
412  }
413  s->pitch_nbits = av_ceil_log2(pitch_range);
414  s->last_pitch_val = 40;
416  s->history_nsamples = s->max_pitch_val + 8;
417 
419  int min_sr = ((((1 << 8) - 50) * 400) + 0xFF) >> 8,
420  max_sr = ((((MAX_SIGNAL_HISTORY - 8) << 8) + 205) * 2000 / 37) >> 8;
421 
422  av_log(ctx, AV_LOG_ERROR,
423  "Unsupported samplerate %d (min=%d, max=%d)\n",
424  ctx->sample_rate, min_sr, max_sr); // 322-22097 Hz
425 
426  return -1;
427  }
428 
429  s->block_conv_table[0] = s->min_pitch_val;
430  s->block_conv_table[1] = (pitch_range * 25) >> 6;
431  s->block_conv_table[2] = (pitch_range * 44) >> 6;
432  s->block_conv_table[3] = s->max_pitch_val - 1;
433  s->block_delta_pitch_hrange = (pitch_range >> 3) & ~0xF;
434  if (s->block_delta_pitch_hrange <= 0) {
435  av_log(ctx, AV_LOG_ERROR, "Invalid delta pitch hrange; broken extradata?\n");
436  return -1;
437  }
438  s->block_delta_pitch_nbits = 1 + av_ceil_log2(s->block_delta_pitch_hrange);
440  s->block_conv_table[3] + 1 +
441  2 * (s->block_conv_table[1] - 2 * s->min_pitch_val);
442  s->block_pitch_nbits = av_ceil_log2(s->block_pitch_range);
443 
444  ctx->channels = 1;
447 
449  ctx->coded_frame = &s->frame;
450 
451  return 0;
452 }
453 
454 /**
455  * @name Postfilter functions
456  * Postfilter functions (gain control, wiener denoise filter, DC filter,
457  * kalman smoothening, plus surrounding code to wrap it)
458  * @{
459  */
/**
 * Adaptive gain control (as used in postfilter).
 *
 * Identical to #ff_adaptive_gain_control() in acelp_vectors.c, except
 * that the energy here is calculated using sum(abs(...)), whereas the
 * other codecs (e.g. AMR-NB, SIPRO) use sqrt(dotproduct(...)).
 *
 * The per-sample gain is an exponentially smoothed version of
 * (1 - alpha) * speech_energy / postfilter_energy. If the postfilter
 * signal has no energy at all (all-zero input, or size <= 0), the target
 * gain is taken to be zero instead of dividing by zero; this keeps NaN/Inf
 * out of the output and, more importantly, out of the filter memory that
 * is carried over to subsequent calls.
 *
 * @param out output buffer for filtered samples
 * @param in input buffer containing the samples as they are after the
 *           postfilter steps so far
 * @param speech_synth input buffer containing speech synth before postfilter
 * @param size input buffer size
 * @param alpha exponential filter factor
 * @param gain_mem pointer to filter memory (single float); read on entry
 *                 and updated on return
 */
static void adaptive_gain_control(float *out, const float *in,
                                  const float *speech_synth,
                                  int size, float alpha, float *gain_mem)
{
    int i;
    float speech_energy = 0.0, postfilter_energy = 0.0, gain_scale_factor;
    float mem = *gain_mem;

    for (i = 0; i < size; i++) {
        speech_energy     += fabsf(speech_synth[i]);
        postfilter_energy += fabsf(in[i]);
    }

    /* Both energies are sums of absolute values, so the denominator can
     * only be zero when every postfilter sample is zero (or size <= 0). */
    gain_scale_factor = postfilter_energy == 0.0 ? 0.0 :
                        (1.0 - alpha) * speech_energy / postfilter_energy;

    for (i = 0; i < size; i++) {
        mem    = alpha * mem + gain_scale_factor;
        out[i] = in[i] * mem;
    }

    *gain_mem = mem;
}
496 
497 /**
498  * Kalman smoothing function.
499  *
500  * This function looks back pitch +/- 3 samples back into history to find
501  * the best fitting curve (that one giving the optimal gain of the two
502  * signals, i.e. the highest dot product between the two), and then
503  * uses that signal history to smoothen the output of the speech synthesis
504  * filter.
505  *
506  * @param s WMA Voice decoding context
507  * @param pitch pitch of the speech signal
508  * @param in input speech signal
509  * @param out output pointer for smoothened signal
510  * @param size input/output buffer size
511  *
512  * @returns -1 if no smoothening took place, e.g. because no optimal
513  * fit could be found, or 0 on success.
514  */
515 static int kalman_smoothen(WMAVoiceContext *s, int pitch,
516  const float *in, float *out, int size)
517 {
518  int n;
519  float optimal_gain = 0, dot;
520  const float *ptr = &in[-FFMAX(s->min_pitch_val, pitch - 3)],
521  *end = &in[-FFMIN(s->max_pitch_val, pitch + 3)],
522  *best_hist_ptr = NULL;
523 
524  /* find best fitting point in history */
525  do {
526  dot = ff_scalarproduct_float_c(in, ptr, size);
527  if (dot > optimal_gain) {
528  optimal_gain = dot;
529  best_hist_ptr = ptr;
530  }
531  } while (--ptr >= end);
532 
533  if (optimal_gain <= 0)
534  return -1;
535  dot = ff_scalarproduct_float_c(best_hist_ptr, best_hist_ptr, size);
536  if (dot <= 0) // would be 1.0
537  return -1;
538 
539  if (optimal_gain <= dot) {
540  dot = dot / (dot + 0.6 * optimal_gain); // 0.625-1.000
541  } else
542  dot = 0.625;
543 
544  /* actual smoothing */
545  for (n = 0; n < size; n++)
546  out[n] = best_hist_ptr[n] + dot * (in[n] - best_hist_ptr[n]);
547 
548  return 0;
549 }
550 
/**
 * Get the tilt factor of a formant filter from its transfer function.
 *
 * Computed as the ratio of two sums over the coefficients:
 * (lpcs[0] + sum(lpcs[i] * lpcs[i + 1])) / (1 + sum(lpcs[i] * lpcs[i])).
 *
 * @see #tilt_factor() in amrnbdec.c, which does essentially the same,
 *      but somehow (??) it does a speech synthesis filter in the
 *      middle, which is missing here
 *
 * @param lpcs   LPC coefficients
 * @param n_lpcs Size of LPC buffer
 * @returns the tilt factor
 */
static float tilt_factor(const float *lpcs, int n_lpcs)
{
    /* zero-lag term; the leading 1.0 makes the denominator at least 1 */
    const float lag0 = 1.0 + ff_scalarproduct_float_c(lpcs, lpcs, n_lpcs);
    /* one-sample-lag term */
    const float lag1 = lpcs[0] +
                       ff_scalarproduct_float_c(lpcs, lpcs + 1, n_lpcs - 1);

    return lag1 / lag0;
}
570 
571 /**
572  * Derive denoise filter coefficients (in real domain) from the LPCs.
 *
 * @param s         decoding context; supplies the RDFT/IRDFT and DCT/DST
 *                  contexts, the sin/cos lookup tables and the
 *                  denoise_tilt_corr flag used here
 * @param lpcs      input LPC buffer; it is transformed in place and then
 *                  reused as scratch space (entries 0..64 are rewritten),
 *                  so its contents are destroyed
 * @param fcb_type  fixed codebook type; here it only selects the gain
 *                  multiplier (5/13 for FCB_TYPE_HARDCODED, 5/14.7
 *                  otherwise)
 * @param coeffs    output buffer for the filter coefficients; entries
 *                  [remainder, 128) are cleared and entries
 *                  [0, remainder) are scaled by the final normalisation
 * @param remainder number of leading coefficients that are kept
573  */
574 static void calc_input_response(WMAVoiceContext *s, float *lpcs,
575  int fcb_type, float *coeffs, int remainder)
576 {
577  float last_coeff, min = 15.0, max = -15.0;
578  float irange, angle_mul, gain_mul, range, sq;
579  int n, idx;
580 
581  /* Create frequency power spectrum of speech input (i.e. RDFT of LPCs) */
582  s->rdft.rdft_calc(&s->rdft, lpcs);
/* log_range(var, expr): store log10f(expr) in var and widen the running
 * [min, max] interval so that it includes that value. */
583 #define log_range(var, assign) do { \
584  float tmp = log10f(assign); var = tmp; \
585  max = FFMAX(max, tmp); min = FFMIN(min, tmp); \
586  } while (0)
/* The value derived from lpcs[1] is parked in last_coeff because lpcs[1]
 * itself is overwritten by the n == 1 iteration below; it is stored to
 * lpcs[64] once the loop is done. Each iteration only reads entries at
 * index >= 2 * n, so writing lpcs[n] in place does not clobber input that
 * is still needed. */
587  log_range(last_coeff, lpcs[1] * lpcs[1]);
588  for (n = 1; n < 64; n++)
589  log_range(lpcs[n], lpcs[n * 2] * lpcs[n * 2] +
590  lpcs[n * 2 + 1] * lpcs[n * 2 + 1]);
591  log_range(lpcs[0], lpcs[0] * lpcs[0]);
592 #undef log_range
593  range = max - min;
594  lpcs[64] = last_coeff;
595 
596  /* Now, use this spectrum to pick out these frequencies with higher
597  * (relative) power/energy (which we then take to be "not noise"),
598  * and set up a table (still in lpc[]) of (relative) gains per frequency.
599  * These frequencies will be maintained, while others ("noise") will be
600  * decreased in the filter output. */
/* NOTE(review): range is used as a divisor without a zero check; whether
 * max == min can occur for real input is not visible from here. */
601  irange = 64.0 / range; // so irange*(max-value) is in the range [0, 63]
602  gain_mul = range * (fcb_type == FCB_TYPE_HARDCODED ? (5.0 / 13.0) :
603  (5.0 / 14.7));
604  angle_mul = gain_mul * (8.0 * M_LN10 / M_PI);
605  for (n = 0; n <= 64; n++) {
606  float pwr;
607 
608  idx = FFMAX(0, lrint((max - lpcs[n]) * irange) - 1);
/* NOTE(review): pwr is read on the following lines, but no assignment to
 * it appears in this listing, and the idx computed just above is not used
 * before being overwritten. The listing's own numbering also skips from
 * 608 to 610, so a statement (presumably a table lookup of pwr by idx)
 * seems to have been lost here - confirm against the upstream file. */
610  lpcs[n] = angle_mul * pwr;
611 
612  /* 70.57 =~ 1/log10(1.0331663) */
/* the float expression is implicitly converted (truncated) to int */
613  idx = (pwr * gain_mul - 0.0295) * 70.570526123;
614  if (idx > 127) { // fallback if index falls outside table range
615  coeffs[n] = wmavoice_energy_table[127] *
616  powf(1.0331663, idx - 127);
617  } else
618  coeffs[n] = wmavoice_energy_table[FFMAX(0, idx)];
619  }
620 
621  /* calculate the Hilbert transform of the gains, which we do (since this
622  * is a sinus input) by doing a phase shift (in theory, H(sin())=cos()).
623  * Hilbert_Transform(RDFT(x)) = Laplace_Transform(x), which calculates the
624  * "moment" of the LPCs in this filter. */
625  s->dct.dct_calc(&s->dct, lpcs);
626  s->dst.dct_calc(&s->dst, lpcs);
627 
628  /* Split out the coefficient indexes into phase/magnitude pairs */
/* The clipped value is in [-255, 255], so idx is in [0, 510], which matches
 * the 511-entry sin/cos tables. */
629  idx = 255 + av_clip(lpcs[64], -255, 255);
630  coeffs[0] = coeffs[0] * s->cos[idx];
631  idx = 255 + av_clip(lpcs[64] - 2 * lpcs[63], -255, 255);
632  last_coeff = coeffs[64] * s->cos[idx];
/* Each pass of this loop handles two entries: n is decremented once by the
 * `!--n` test in the middle (which also ends the loop when n reaches 0) and
 * once by the for statement. The two halves differ in the sign applied to
 * lpcs[64]. Entries are expanded from the top down (coeffs[n] into
 * coeffs[2n] and coeffs[2n + 1]), so each source entry is read before it
 * can be overwritten. */
633  for (n = 63;; n--) {
634  idx = 255 + av_clip(-lpcs[64] - 2 * lpcs[n - 1], -255, 255);
635  coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
636  coeffs[n * 2] = coeffs[n] * s->cos[idx];
637 
638  if (!--n) break;
639 
640  idx = 255 + av_clip( lpcs[64] - 2 * lpcs[n - 1], -255, 255);
641  coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
642  coeffs[n * 2] = coeffs[n] * s->cos[idx];
643  }
/* coeffs[1] receives the value computed from coeffs[64] before the loop */
644  coeffs[1] = last_coeff;
645 
646  /* move into real domain */
647  s->irdft.rdft_calc(&s->irdft, coeffs);
648 
649  /* tilt correction and normalize scale */
650  memset(&coeffs[remainder], 0, sizeof(coeffs[0]) * (128 - remainder));
651  if (s->denoise_tilt_corr) {
652  float tilt_mem = 0;
653 
654  coeffs[remainder - 1] = 0;
655  ff_tilt_compensation(&tilt_mem,
656  -1.8 * tilt_factor(coeffs, remainder - 1),
657  coeffs, remainder);
658  }
/* scale the kept coefficients by (1/64) / sqrt(sum of their squares) */
659  sq = (1.0 / 64.0) * sqrtf(1 / ff_scalarproduct_float_c(coeffs, coeffs, remainder));
660  for (n = 0; n < remainder; n++)
661  coeffs[n] *= sq;
662 }
663 
664 /**
665  * This function applies a Wiener filter on the (noisy) speech signal as
666  * a means to denoise it.
667  *
668  * - take RDFT of LPCs to get the power spectrum of the noise + speech;
669  * - using this power spectrum, calculate (for each frequency) the Wiener
670  * filter gain, which depends on the frequency power and desired level
671  * of noise subtraction (when set too high, this leads to artifacts)
672  * We can do this symmetrically over the X-axis (so 0-4kHz is the inverse
673  * of 4-8kHz);
674  * - by doing a phase shift, calculate the Hilbert transform of this array
675  * of per-frequency filter-gains to get the filtering coefficients;
676  * - smoothen/normalize/de-tilt these filter coefficients as desired;
677  * - take RDFT of noisy sound, apply the coefficients and take its IRDFT
678  * to get the denoised speech signal;
679  * - the leftover (i.e. output of the IRDFT on denoised speech data beyond
680  * the frame boundary) are saved and applied to subsequent frames by an
681  * overlap-add method (otherwise you get clicking-artifacts).
682  *
 * For FCB_TYPE_SILENCE frames no filtering is done; only the cached
 * leftover of earlier frames is added to the signal.
 *
683  * @param s WMA Voice decoding context
684  * @param fcb_type Frame (codebook) type
685  * @param synth_pf input: the noisy speech signal, output: denoised speech
686  * data; should be 16-byte aligned (for ASM purposes).
 * For non-silence frames entries up to index 127 are
 * written, regardless of size.
687  * @param size size of the speech data
688  * @param lpcs LPCs used to synthesize this frame's speech data
689  */
690 static void wiener_denoise(WMAVoiceContext *s, int fcb_type,
691  float *synth_pf, int size,
692  const float *lpcs)
693 {
/* remainder is only assigned inside the first non-silence branch below and
 * is only read again inside the second one, which has the same condition */
694  int remainder, lim, n;
695 
696  if (fcb_type != FCB_TYPE_SILENCE) {
697  float *tilted_lpcs = s->tilted_lpcs_pf,
698  *coeffs = s->denoise_coeffs_pf, tilt_mem = 0;
699 
/* build a zero-padded 128-entry buffer { 1.0, lpcs[0..lsps-1], 0, ... } and
 * apply tilt compensation to its first lsps + 2 entries */
700  tilted_lpcs[0] = 1.0;
701  memcpy(&tilted_lpcs[1], lpcs, sizeof(lpcs[0]) * s->lsps);
702  memset(&tilted_lpcs[s->lsps + 1], 0,
703  sizeof(tilted_lpcs[0]) * (128 - s->lsps - 1));
704  ff_tilt_compensation(&tilt_mem, 0.7 * tilt_factor(lpcs, s->lsps),
705  tilted_lpcs, s->lsps + 2);
706 
707  /* The IRDFT output (127 samples for 7-bit filter) beyond the frame
708  * size is applied to the next frame. All input beyond this is zero,
709  * and thus all output beyond this will go towards zero, hence we can
710  * limit to min(size-1, 127-size) as a performance consideration. */
711  remainder = FFMIN(127 - size, size - 1);
/* tilted_lpcs is used as scratch by this call; coeffs receives the result */
712  calc_input_response(s, tilted_lpcs, fcb_type, coeffs, remainder);
713 
714  /* apply coefficients (in frequency spectrum domain), i.e. complex
715  * number multiplication */
716  memset(&synth_pf[size], 0, sizeof(synth_pf[0]) * (128 - size));
717  s->rdft.rdft_calc(&s->rdft, synth_pf);
718  s->rdft.rdft_calc(&s->rdft, coeffs);
/* entries 0 and 1 are multiplied as plain real values, the remaining 63
 * (even, odd) pairs as complex numbers (re, im) */
719  synth_pf[0] *= coeffs[0];
720  synth_pf[1] *= coeffs[1];
721  for (n = 1; n < 64; n++) {
722  float v1 = synth_pf[n * 2], v2 = synth_pf[n * 2 + 1];
723  synth_pf[n * 2] = v1 * coeffs[n * 2] - v2 * coeffs[n * 2 + 1];
724  synth_pf[n * 2 + 1] = v2 * coeffs[n * 2] + v1 * coeffs[n * 2 + 1];
725  }
726  s->irdft.rdft_calc(&s->irdft, synth_pf);
727  }
728 
729  /* merge filter output with the history of previous runs */
730  if (s->denoise_filter_cache_size) {
731  lim = FFMIN(s->denoise_filter_cache_size, size);
732  for (n = 0; n < lim; n++)
733  synth_pf[n] += s->denoise_filter_cache[n];
734  s->denoise_filter_cache_size -= lim;
/* shift the not yet consumed part of the cache to the front */
735  memmove(s->denoise_filter_cache, &s->denoise_filter_cache[size],
/* NOTE(review): the size argument and the closing ");" of this memmove()
 * call do not appear in this listing (its own numbering skips from 735 to
 * 737), so a line seems to have been lost here - confirm against the
 * upstream file. */
737  }
738 
739  /* move remainder of filter output into a cache for future runs */
740  if (fcb_type != FCB_TYPE_SILENCE) {
/* overlap-add onto what is still cached; anything beyond the currently
 * cached length is copied, and the cache only ever grows here */
741  lim = FFMIN(remainder, s->denoise_filter_cache_size);
742  for (n = 0; n < lim; n++)
743  s->denoise_filter_cache[n] += synth_pf[size + n];
744  if (lim < remainder) {
745  memcpy(&s->denoise_filter_cache[lim], &synth_pf[size + lim],
746  sizeof(s->denoise_filter_cache[0]) * (remainder - lim));
747  s->denoise_filter_cache_size = remainder;
748  }
749  }
750 }
751 
752 /**
753  * Averaging projection filter, the postfilter used in WMAVoice.
754  *
755  * This uses the following steps:
756  * - A zero-synthesis filter (generate excitation from synth signal)
757  * - Kalman smoothing on excitation, based on pitch
758  * - Re-synthesized smoothened output
759  * - Iterative Wiener denoise filter
760  * - Adaptive gain filter
761  * - DC filter
762  *
 * The Kalman smoothing step is only attempted for fcb_type values of
 * FCB_TYPE_AW_PULSES and above; the DC filter step only runs when the
 * context's dc_level is larger than 8.
 *
763  * @param s WMAVoice decoding context
764  * @param synth Speech synthesis output (before postfilter)
765  * @param samples Output buffer for filtered samples
766  * @param size Buffer size of synth & samples; must not exceed
 * MAX_FRAMESIZE / 2 (enforced by an assertion)
767  * @param lpcs Generated LPCs used for speech synthesis
768  * @param zero_exc_pf destination for zero synthesis filter (16-byte aligned)
769  * @param fcb_type Frame type (silence, hardcoded, AW-pulses or FCB-pulses)
770  * @param pitch Pitch of the input signal
771  */
772 static void postfilter(WMAVoiceContext *s, const float *synth,
773  float *samples, int size,
774  const float *lpcs, float *zero_exc_pf,
775  int fcb_type, int pitch)
776 {
/* synth_pf points MAX_LSPS_ALIGN16 floats into the context buffer, leaving
 * room in front of it for the filter history written further down;
 * synth_filter_in defaults to the unsmoothed excitation */
777  float synth_filter_in_buf[MAX_FRAMESIZE / 2],
778  *synth_pf = &s->synth_filter_out_buf[MAX_LSPS_ALIGN16],
779  *synth_filter_in = zero_exc_pf;
780 
/* the local smoothing buffer holds MAX_FRAMESIZE / 2 samples */
781  av_assert0(size <= MAX_FRAMESIZE / 2);
782 
783  /* generate excitation from input signal */
784  ff_celp_lp_zero_synthesis_filterf(zero_exc_pf, lpcs, synth, size, s->lsps);
785 
/* kalman_smoothen() returns 0 on success; only then is its output used
 * as the synthesis filter input */
786  if (fcb_type >= FCB_TYPE_AW_PULSES &&
787  !kalman_smoothen(s, pitch, zero_exc_pf, synth_filter_in_buf, size))
788  synth_filter_in = synth_filter_in_buf;
789 
790  /* re-synthesize speech after smoothening, and keep history */
791  ff_celp_lp_synthesis_filterf(synth_pf, lpcs,
792  synth_filter_in, size, s->lsps);
/* copy the last s->lsps output samples to just in front of synth_pf */
793  memcpy(&synth_pf[-s->lsps], &synth_pf[size - s->lsps],
794  sizeof(synth_pf[0]) * s->lsps);
795 
796  wiener_denoise(s, fcb_type, synth_pf, size, lpcs);
797 
798  adaptive_gain_control(samples, synth_pf, synth, size, 0.99,
799  &s->postfilter_agc);
800 
801  if (s->dc_level > 8) {
802  /* remove ultra-low frequency DC noise / highpass filter;
803  * coefficients are identical to those used in SIPR decoding,
804  * and very closely resemble those used in AMR-NB decoding. */
/* NOTE(review): the start of the call that the following argument lines
 * belong to (the function name and its first arguments) does not appear in
 * this listing, whose own numbering skips from 804 to 806, so a line seems
 * to have been lost here - confirm against the upstream file. */
806  (const float[2]) { -1.99997, 1.0 },
807  (const float[2]) { -1.9330735188, 0.93589198496 },
808  0.93980580475, s->dcf_mem, size);
809  }
810 }
811 /**
812  * @}
813  */
814 
/**
 * Dequantize LSPs.
 *
 * The result is accumulated over all stages: for every stage, the selected
 * codebook vector is scaled by that stage's multiplier, offset by that
 * stage's base value and added to the output. The codebooks of the
 * individual stages are stored back to back in table, each one holding
 * sizes[stage] vectors of num entries.
 *
 * @param lsps     output pointer to the array that will hold the LSPs
 * @param num      number of LSPs to be dequantized
 * @param values   quantized values, contains n_stages values
 * @param sizes    range (i.e. max value) of each quantized value
 * @param n_stages number of dequantization runs
 * @param table    dequantization table to be used
 * @param mul_q    LSF multiplier
 * @param base_q   base (lowest) LSF values
 */
static void dequant_lsps(double *lsps, int num,
                         const uint16_t *values,
                         const uint16_t *sizes,
                         int n_stages, const uint8_t *table,
                         const double *mul_q,
                         const double *base_q)
{
    const uint8_t *codebook = table; /* start of the current stage's vectors */
    int stage, i;

    memset(lsps, 0, num * sizeof(*lsps));

    for (stage = 0; stage < n_stages; stage++) {
        const uint8_t *vector = codebook + values[stage] * num;
        const double offset   = base_q[stage];
        const double scale    = mul_q[stage];

        for (i = 0; i < num; i++)
            lsps[i] += offset + scale * vector[i];

        /* skip over all vectors of this stage */
        codebook += sizes[stage] * num;
    }
}
846 
847 /**
848  * @name LSP dequantization routines
849  * LSP dequantization routines, for 10/16LSPs and independent/residual coding.
850  * @note we assume enough bits are available, caller should check.
851  * lsp10i() consumes 24 bits; lsp10r() consumes an additional 24 bits;
852  * lsp16i() consumes 34 bits; lsp16r() consumes an additional 26 bits.
853  * @{
854  */
855 /**
856  * Parse 10 independently-coded LSPs.
857  */
858 static void dequant_lsp10i(GetBitContext *gb, double *lsps)
859 {
860  static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
861  static const double mul_lsf[4] = {
862  5.2187144800e-3, 1.4626986422e-3,
863  9.6179549166e-4, 1.1325736225e-3
864  };
865  static const double base_lsf[4] = {
866  M_PI * -2.15522e-1, M_PI * -6.1646e-2,
867  M_PI * -3.3486e-2, M_PI * -5.7408e-2
868  };
869  uint16_t v[4];
870 
871  v[0] = get_bits(gb, 8);
872  v[1] = get_bits(gb, 6);
873  v[2] = get_bits(gb, 5);
874  v[3] = get_bits(gb, 5);
875 
876  dequant_lsps(lsps, 10, v, vec_sizes, 4, wmavoice_dq_lsp10i,
877  mul_lsf, base_lsf);
878 }
879 
880 /**
881  * Parse 10 independently-coded LSPs, and then derive the tables to
882  * generate LSPs for the other frames from them (residual coding).
 *
 * After the bits consumed by dequant_lsp10i(), this reads a 5-bit
 * interpolation index and three codebook indices of 7, 6 and 6 bits.
 *
 * @param i_lsps output, the 10 independently-coded LSPs
 * @param old    10 reference values; a1 is interpolated between these
 *               and i_lsps
 * @param a1     output, two sets of 10 interpolated values (20 entries)
 * @param a2     output, 20 values dequantized from wmavoice_dq_lsp10r
 * @param q_mode selects which of two interpolation tables is used
 *
 * NOTE(review): the line holding the function name and its first
 * parameter, and the right-hand side of the ipol_tab initializer, are
 * absent from this listing; the call site in synth_superframe() invokes
 * this as dequant_lsp10r(gb, ...).
883  */
885  double *i_lsps, const double *old,
886  double *a1, double *a2, int q_mode)
887 {
888  static const uint16_t vec_sizes[3] = { 128, 64, 64 };
889  static const double mul_lsf[3] = {
890  2.5807601174e-3, 1.2354460219e-3, 1.1763821673e-3
891  };
892  static const double base_lsf[3] = {
893  M_PI * -1.07448e-1, M_PI * -5.2706e-2, M_PI * -5.1634e-2
894  };
895  const float (*ipol_tab)[2][10] = q_mode ?
897  uint16_t interpol, v[3];
898  int n;
899 
 /* the independently-coded LSPs come first in the bitstream */
900  dequant_lsp10i(gb, i_lsps);
901 
 /* 5 + 7 + 6 + 6 = 24 further bits */
902  interpol = get_bits(gb, 5);
903  v[0] = get_bits(gb, 7);
904  v[1] = get_bits(gb, 6);
905  v[2] = get_bits(gb, 6);
906 
 /* a1[0..9] and a1[10..19]: i_lsps moved towards old by the two
  * per-coefficient weights selected by interpol */
907  for (n = 0; n < 10; n++) {
908  double delta = old[n] - i_lsps[n];
909  a1[n] = ipol_tab[interpol][0][n] * delta + i_lsps[n];
910  a1[10 + n] = ipol_tab[interpol][1][n] * delta + i_lsps[n];
911  }
912 
 /* three-stage dequantization of 20 values into a2 */
913  dequant_lsps(a2, 20, v, vec_sizes, 3, wmavoice_dq_lsp10r,
914  mul_lsf, base_lsf);
915 }
916 
917 /**
918  * Parse 16 independently-coded LSPs.
919  */
920 static void dequant_lsp16i(GetBitContext *gb, double *lsps)
921 {
922  static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 };
923  static const double mul_lsf[5] = {
924  3.3439586280e-3, 6.9908173703e-4,
925  3.3216608306e-3, 1.0334960326e-3,
926  3.1899104283e-3
927  };
928  static const double base_lsf[5] = {
929  M_PI * -1.27576e-1, M_PI * -2.4292e-2,
930  M_PI * -1.28094e-1, M_PI * -3.2128e-2,
931  M_PI * -1.29816e-1
932  };
933  uint16_t v[5];
934 
935  v[0] = get_bits(gb, 8);
936  v[1] = get_bits(gb, 6);
937  v[2] = get_bits(gb, 7);
938  v[3] = get_bits(gb, 6);
939  v[4] = get_bits(gb, 7);
940 
941  dequant_lsps( lsps, 5, v, vec_sizes, 2,
942  wmavoice_dq_lsp16i1, mul_lsf, base_lsf);
943  dequant_lsps(&lsps[5], 5, &v[2], &vec_sizes[2], 2,
944  wmavoice_dq_lsp16i2, &mul_lsf[2], &base_lsf[2]);
945  dequant_lsps(&lsps[10], 6, &v[4], &vec_sizes[4], 1,
946  wmavoice_dq_lsp16i3, &mul_lsf[4], &base_lsf[4]);
947 }
948 
949 /**
950  * Parse 16 independently-coded LSPs, and then derive the tables to
951  * generate LSPs for the other frames from them (residual coding).
 *
 * After the bits consumed by dequant_lsp16i(), this reads a 5-bit
 * interpolation index and three 7-bit codebook indices.
 *
 * @param i_lsps output, the 16 independently-coded LSPs
 * @param old    16 reference values; a1 is interpolated between these
 *               and i_lsps
 * @param a1     output, two sets of 16 interpolated values (32 entries)
 * @param a2     output, 32 values dequantized as groups of 10, 10 and 12
 * @param q_mode selects which of two interpolation tables is used
 *
 * NOTE(review): the line holding the function name and its first
 * parameter, and the right-hand side of the ipol_tab initializer, are
 * absent from this listing; the call site in synth_superframe() invokes
 * this as dequant_lsp16r(gb, ...).
952  */
954  double *i_lsps, const double *old,
955  double *a1, double *a2, int q_mode)
956 {
957  static const uint16_t vec_sizes[3] = { 128, 128, 128 };
958  static const double mul_lsf[3] = {
959  1.2232979501e-3, 1.4062241527e-3, 1.6114744851e-3
960  };
961  static const double base_lsf[3] = {
962  M_PI * -5.5830e-2, M_PI * -5.2908e-2, M_PI * -5.4776e-2
963  };
964  const float (*ipol_tab)[2][16] = q_mode ?
966  uint16_t interpol, v[3];
967  int n;
968 
 /* the independently-coded LSPs come first in the bitstream */
969  dequant_lsp16i(gb, i_lsps);
970 
 /* 5 + 7 + 7 + 7 = 26 further bits */
971  interpol = get_bits(gb, 5);
972  v[0] = get_bits(gb, 7);
973  v[1] = get_bits(gb, 7);
974  v[2] = get_bits(gb, 7);
975 
 /* a1[0..15] and a1[16..31]: i_lsps moved towards old by the two
  * per-coefficient weights selected by interpol */
976  for (n = 0; n < 16; n++) {
977  double delta = old[n] - i_lsps[n];
978  a1[n] = ipol_tab[interpol][0][n] * delta + i_lsps[n];
979  a1[16 + n] = ipol_tab[interpol][1][n] * delta + i_lsps[n];
980  }
981 
 /* a2 is filled in three single-stage groups, each with its own table,
  * index, multiplier and base value */
982  dequant_lsps( a2, 10, v, vec_sizes, 1,
983  wmavoice_dq_lsp16r1, mul_lsf, base_lsf);
984  dequant_lsps(&a2[10], 10, &v[1], &vec_sizes[1], 1,
985  wmavoice_dq_lsp16r2, &mul_lsf[1], &base_lsf[1]);
986  dequant_lsps(&a2[20], 12, &v[2], &vec_sizes[2], 1,
987  wmavoice_dq_lsp16r3, &mul_lsf[2], &base_lsf[2]);
988 }
989 
990 /**
991  * @}
992  * @name Pitch-adaptive window coding functions
993  * The next few functions are for pitch-adaptive window coding.
994  * @{
995  */
996 /**
997  * Parse the offset of the first pitch-adaptive window pulses, and
998  * the distribution of pulses between the two blocks in this frame.
 *
 * Reads 6 bits, plus 2 more when the 6-bit value is >= 54, and stores the
 * results in s->aw_idx_is_ext, s->aw_pulse_range, s->aw_n_pulses[] and
 * s->aw_first_pulse_off[].
 *
999  * @param s WMA Voice decoding context private data
1000  * @param gb bit I/O context
1001  * @param pitch pitch for each block in this frame
 *              (pitch[0] and pitch[1] are read)
 *
 * NOTE(review): the line holding the function name is absent from this
 * listing; synth_frame() invokes it as aw_parse_coords(s, gb, pitch).
1002  */
1004  const int *pitch)
1005 {
1006  static const int16_t start_offset[94] = {
1007  -11, -9, -7, -5, -3, -1, 1, 3, 5, 7, 9, 11,
1008  13, 15, 18, 17, 19, 20, 21, 22, 23, 24, 25, 26,
1009  27, 28, 29, 30, 31, 32, 33, 35, 37, 39, 41, 43,
1010  45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67,
1011  69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91,
1012  93, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113, 115,
1013  117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139,
1014  141, 143, 145, 147, 149, 151, 153, 155, 157, 159
1015  };
1016  int bits, offset;
1017 
1018  /* position of pulse */
1019  s->aw_idx_is_ext = 0;
 /* values 54..63 are escape codes: together with 2 extra bits they map
  * to table indices 54..93, which matches the 94-entry start_offset[] */
1020  if ((bits = get_bits(gb, 6)) >= 54) {
1021  s->aw_idx_is_ext = 1;
1022  bits += (bits - 54) * 3 + get_bits(gb, 2);
1023  }
1024 
1025  /* for a repeated pulse at pulse_off with a pitch_lag of pitch[], count
1026  * the distribution of the pulses in each block contained in this frame. */
1027  s->aw_pulse_range = FFMIN(pitch[0], pitch[1]) > 32 ? 24 : 16;
 /* advance a negative start offset by whole pitch periods until it is
  * non-negative.
  * NOTE(review): this loop only terminates for pitch[0] > 0 - confirm the
  * caller guarantees that. */
1028  for (offset = start_offset[bits]; offset < 0; offset += pitch[0]) ;
 /* number of pitch-spaced pulses from offset that start before sample
  * MAX_FRAMESIZE / 2 (rounded-up division) */
1029  s->aw_n_pulses[0] = (pitch[0] - 1 + MAX_FRAMESIZE / 2 - offset) / pitch[0];
1030  s->aw_first_pulse_off[0] = offset - s->aw_pulse_range / 2;
 /* continue from the first pulse position past block 0 */
1031  offset += s->aw_n_pulses[0] * pitch[0];
1032  s->aw_n_pulses[1] = (pitch[1] - 1 + MAX_FRAMESIZE - offset) / pitch[1];
 /* block 1 offsets are relative to the start of block 1, hence the
  * additional MAX_FRAMESIZE / 2 subtracted here */
1033  s->aw_first_pulse_off[1] = offset - (MAX_FRAMESIZE + s->aw_pulse_range) / 2;
1034 
1035  /* if continuing from a position before the block, reset position to
1036  * start of block (when corrected for the range over which it can be
1037  * spread in aw_pulse_set1()). */
1038  if (start_offset[bits] < MAX_FRAMESIZE / 2) {
1039  while (s->aw_first_pulse_off[1] - pitch[1] + s->aw_pulse_range > 0)
1040  s->aw_first_pulse_off[1] -= pitch[1];
1041  if (start_offset[bits] < 0)
1042  while (s->aw_first_pulse_off[0] - pitch[0] + s->aw_pulse_range > 0)
1043  s->aw_first_pulse_off[0] -= pitch[0];
1044  }
1045 }
1046 
1047 /**
1048  * Apply second set of pitch-adaptive window pulses.
 *
 * Reads an index (5, 4 or 3 bits) and one sign bit, appends exactly one
 * pulse to fcb (unless no usable position is left, in which case it
 * returns early without adding a pulse), and updates
 * s->aw_next_pulse_off_cache for the following block.
 *
1049  * @param s WMA Voice decoding context private data
1050  * @param gb bit I/O context
1051  * @param block_idx block index in frame [0, 1]
1052  * @param fcb structure containing fixed codebook vector info
 *
 * NOTE(review): the line holding the function name is absent from this
 * listing; synth_block_fcb_acb() invokes it as
 * aw_pulse_set2(s, gb, block_idx, &fcb).
1053  */
1055  int block_idx, AMRFixed *fcb)
1056 {
1057  uint16_t use_mask_mem[9]; // only 5 are used, rest is padding
 /* two padding words precede use_mask[0], so use_mask[-2] and
  * use_mask[-1] are valid */
1058  uint16_t *use_mask = use_mask_mem + 2;
1059  /* in this function, idx is the index in the 80-bit (+ padding) use_mask
1060  * bit-array. Since use_mask consists of 16-bit values, the lower 4 bits
1061  * of idx are the position of the bit within a particular item in the
1062  * array (0 being the most significant bit, and 15 being the least
1063  * significant bit), and the remainder (>> 4) is the index in the
1064  * use_mask[]-array. This is faster and uses less memory than using a
1065  * 80-byte/80-int array. */
1066  int pulse_off = s->aw_first_pulse_off[block_idx],
1067  pulse_start, n, idx, range, aidx, start_off = 0;
1068 
1069  /* set offset of first pulse to within this block */
1070  if (s->aw_n_pulses[block_idx] > 0)
1071  while (pulse_off + s->aw_pulse_range < 1)
1072  pulse_off += fcb->pitch_lag;
1073 
1074  /* find range per pulse */
1075  if (s->aw_n_pulses[0] > 0) {
1076  if (block_idx == 0) {
1077  range = 32;
1078  } else /* block_idx = 1 */ {
1079  range = 8;
 /* resume from the position stored at the end of the previous
  * call to this function */
1080  if (s->aw_n_pulses[block_idx] > 0)
1081  pulse_off = s->aw_next_pulse_off_cache;
1082  }
1083  } else
1084  range = 16;
 /* range equals 1 << (number of index bits read below) in all 3 cases */
1085  pulse_start = s->aw_n_pulses[block_idx] > 0 ? pulse_off - range / 2 : 0;
1086 
1087  /* aw_pulse_set1() already applies pulses around pulse_off (to be exact,
1088  * in the range [pulse_off, pulse_off + s->aw_pulse_range]), and thus
1089  * we exclude that range from being pulsed again in this function. */
 /* padding words are cleared, the 5 payload words (80 bits) are all set */
1090  memset(&use_mask[-2], 0, 2 * sizeof(use_mask[0]));
1091  memset( use_mask, -1, 5 * sizeof(use_mask[0]));
1092  memset(&use_mask[5], 0, 2 * sizeof(use_mask[0]));
1093  if (s->aw_n_pulses[block_idx] > 0)
1094  for (idx = pulse_off; idx < MAX_FRAMESIZE / 2; idx += fcb->pitch_lag) {
1095  int excl_range = s->aw_pulse_range; // always 16 or 24
1096  uint16_t *use_mask_ptr = &use_mask[idx >> 4];
 /* number of bits from position idx to the end of its word */
1097  int first_sh = 16 - (idx & 15);
 /* clear the tail of the first word, then whatever part of the
  * exclusion range spills into the next one or two words */
1098  *use_mask_ptr++ &= 0xFFFFu << first_sh;
1099  excl_range -= first_sh;
1100  if (excl_range >= 16) {
1101  *use_mask_ptr++ = 0;
1102  *use_mask_ptr &= 0xFFFF >> (excl_range - 16);
1103  } else
1104  *use_mask_ptr &= 0xFFFF >> excl_range;
1105  }
1106 
1107  /* find the 'aidx'th offset that is not excluded */
1108  aidx = get_bits(gb, s->aw_n_pulses[0] > 0 ? 5 - 2 * block_idx : 4);
 /* n counts the free positions found so far; each one found is also
  * cleared in the mask so it cannot be counted twice */
1109  for (n = 0; n <= aidx; pulse_start++) {
1110  for (idx = pulse_start; idx < 0; idx += fcb->pitch_lag) ;
1111  if (idx >= MAX_FRAMESIZE / 2) { // find from zero
 /* ran past the block: pick the first still-set bit of the
  * first non-empty word, or give up if none is left */
1112  if (use_mask[0]) idx = 0x0F;
1113  else if (use_mask[1]) idx = 0x1F;
1114  else if (use_mask[2]) idx = 0x2F;
1115  else if (use_mask[3]) idx = 0x3F;
1116  else if (use_mask[4]) idx = 0x4F;
1117  else return;
1118  idx -= av_log2_16bit(use_mask[idx >> 4]);
1119  }
1120  if (use_mask[idx >> 4] & (0x8000 >> (idx & 15))) {
1121  use_mask[idx >> 4] &= ~(0x8000 >> (idx & 15));
1122  n++;
1123  start_off = idx;
1124  }
1125  }
1126 
 /* one pulse at the selected position; a set sign bit means -1.0 */
1127  fcb->x[fcb->n] = start_off;
1128  fcb->y[fcb->n] = get_bits1(gb) ? -1.0 : 1.0;
1129  fcb->n++;
1130 
1131  /* set offset for next block, relative to start of that block */
1132  n = (MAX_FRAMESIZE / 2 - start_off) % fcb->pitch_lag;
1133  s->aw_next_pulse_off_cache = n ? fcb->pitch_lag - n : 0;
1134 }
1135 
1136 /**
1137  * Apply first set of pitch-adaptive window pulses.
 *
 * Reads 12 bits, or 10 when s->aw_idx_is_ext is set and block_idx is 0.
 * If the block contains pitch pulses (s->aw_n_pulses[block_idx] > 0), up
 * to 3 or 4 pulses relative to s->aw_first_pulse_off[block_idx] are added;
 * otherwise exactly two pulses are added and flagged in no_repeat_mask.
 *
1138  * @param s WMA Voice decoding context private data
1139  * @param gb bit I/O context
1140  * @param block_idx block index in frame [0, 1]
1141  * @param fcb storage location for fixed codebook pulse info
 *
 * NOTE(review): the line holding the function name is absent from this
 * listing; synth_block_fcb_acb() invokes it as
 * aw_pulse_set1(s, gb, block_idx, &fcb).
1142  */
1144  int block_idx, AMRFixed *fcb)
1145 {
1146  int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
1147  float v;
1148 
1149  if (s->aw_n_pulses[block_idx] > 0) {
1150  int n, v_mask, i_mask, sh, n_pulses;
1151 
 /* both layouts use 12 bits: 3 * 4 or 4 * 3 */
1152  if (s->aw_pulse_range == 24) { // 3 pulses, 1:sign + 3:index each
1153  n_pulses = 3;
1154  v_mask = 8;
1155  i_mask = 7;
1156  sh = 4;
1157  } else { // 4 pulses, 1:sign + 2:index each
1158  n_pulses = 4;
1159  v_mask = 4;
1160  i_mask = 3;
1161  sh = 3;
1162  }
1163 
 /* fields are consumed from the least significant end of val, so the
  * last pulse (n = n_pulses - 1) is decoded first */
1164  for (n = n_pulses - 1; n >= 0; n--, val >>= sh) {
1165  fcb->y[fcb->n] = (val & v_mask) ? -1.0 : 1.0;
1166  fcb->x[fcb->n] = (val & i_mask) * n_pulses + n +
1167  s->aw_first_pulse_off[block_idx];
 /* move positions before the block forward by whole pitch lags */
1168  while (fcb->x[fcb->n] < 0)
1169  fcb->x[fcb->n] += fcb->pitch_lag;
 /* a pulse past the end of the block is not kept: fcb->n is not
  * advanced, so the slot is reused or ignored */
1170  if (fcb->x[fcb->n] < MAX_FRAMESIZE / 2)
1171  fcb->n++;
1172  }
1173  } else {
 /* no pitch pulses in this block: bits 1-8 of val select a pair of
  * positions (idx - delta, idx), bit 9 is the sign of the first pulse
  * and bit 0 flips the sign of the second */
1174  int num2 = (val & 0x1FF) >> 1, delta, idx;
1175 
1176  if (num2 < 1 * 79) { delta = 1; idx = num2 + 1; }
1177  else if (num2 < 2 * 78) { delta = 3; idx = num2 + 1 - 1 * 77; }
1178  else if (num2 < 3 * 77) { delta = 5; idx = num2 + 1 - 2 * 76; }
1179  else { delta = 7; idx = num2 + 1 - 3 * 75; }
1180  v = (val & 0x200) ? -1.0 : 1.0;
1181 
 /* set the no_repeat_mask bits of the two pulses added below */
1182  fcb->no_repeat_mask |= 3 << fcb->n;
1183  fcb->x[fcb->n] = idx - delta;
1184  fcb->y[fcb->n] = v;
1185  fcb->x[fcb->n + 1] = idx;
1186  fcb->y[fcb->n + 1] = (val & 1) ? -v : v;
1187  fcb->n += 2;
1188  }
1189 }
1190 
/**
 * @}
 *
 * Generate a pseudo-random number from frame_cntr and block_num, which will
 * lie in the range [0, 1000 - block_size) (so it can be used as an index in
 * a table of size 1000 of which you want to read block_size entries).
 *
 * @param frame_cntr current frame number
 * @param block_num current block index
 * @param block_size amount of entries we want to read from a table
 *                   that has 1000 entries; must be smaller than 1000
 * @return a (non-)random number in the [0, 1000 - block_size) range, or 0
 *         if block_size leaves no valid start index (block_size >= 1000).
 */
static int pRNG(int frame_cntr, int block_num, int block_size)
{
    /* array to simplify the calculation of z:
     * y = (x % 9) * 5 + 6;
     * z = (49995 * x) / y;
     * Since y only has 9 values, we can remove the division by using a
     * LUT and a multiply-high. For each of the 9 values of y, we can
     * rewrite z as:
     * z = x * (49995 / y) + x * ((49995 % y) / y)
     * In this table, each row represents one possible value of y, the
     * first number is 49995 / y, and the second is (49995 % y) / y as a
     * 0.32 fixed-point fraction (i.e. (49995 % y) * ceil(2^32 / y)). */
    static const unsigned int div_tbl[9][2] = {
        { 8332,  3 * 715827883U }, // y =  6
        { 4545,  0 * 390451573U }, // y = 11
        { 3124, 11 * 268435456U }, // y = 16
        { 2380, 15 * 204522253U }, // y = 21
        { 1922, 23 * 165191050U }, // y = 26
        { 1612, 23 * 138547333U }, // y = 31
        { 1388, 27 * 119304648U }, // y = 36
        { 1219, 16 * 104755300U }, // y = 41
        { 1086, 39 *  93368855U }  // y = 46
    };
    unsigned int x, y, z;

    /* the modulus below would be zero (or negative) otherwise */
    if (block_size >= 1000)
        return 0;

    /* unsigned arithmetic, so unexpected inputs wrap instead of overflowing */
    x = (unsigned int) block_num * 1877U + (unsigned int) frame_cntr;
    if (x >= 0xFFFF) x -= 0xFFFF;   // max value of x is 8*1877+0xFFFE=0x13AA6,
                                    // so this is effectively a modulo (%)

    /* a plain modulo keeps the table index in [0, 8] for every x, unlike a
     * reciprocal multiplication that is only exact for small x */
    y = x % 9;

    /* z = x * 49995 / (y * 5 + 6), truncated to 16 bits */
    z = (uint16_t) (x * div_tbl[y][0] +
                    (unsigned int) (((uint64_t) x * div_tbl[y][1]) >> 32));

    return z % (1000 - block_size);
}
1235 
1236 /**
1237  * Parse hardcoded signal for a single block.
 *
 * Fills excitation[0..size-1] with a window of wmavoice_std_codebook
 * scaled by a gain, and clears s->gain_pred_err. For FCB_TYPE_SILENCE no
 * bits are read (offset from pRNG(), gain from s->silence_gain); otherwise
 * 8 + 6 bits are read for offset and gain index.
 *
 * NOTE(review): the line holding the function name is absent from this
 * listing; synth_block() invokes it as
 * synth_block_hardcoded(s, gb, block_idx, size, frame_desc, excitation).
 *
1238  * @note see #synth_block().
1239  */
1241  int block_idx, int size,
1242  const struct frame_type_desc *frame_desc,
1243  float *excitation)
1244 {
1245  float gain;
1246  int n, r_idx;
1247 
1248  av_assert0(size <= MAX_FRAMESIZE);
1249 
1250  /* Set the offset from which we start reading wmavoice_std_codebook */
1251  if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
 /* offset is derived from the frame counter and block index
  * instead of being transmitted */
1252  r_idx = pRNG(s->frame_cntr, block_idx, size);
1253  gain = s->silence_gain;
1254  } else /* FCB_TYPE_HARDCODED */ {
 /* NOTE(review): r_idx can be up to 255 here, so r_idx + size must
  * stay within wmavoice_std_codebook - confirm the table size */
1255  r_idx = get_bits(gb, 8);
1256  gain = wmavoice_gain_universal[get_bits(gb, 6)];
1257  }
1258 
1259  /* Clear gain prediction parameters */
1260  memset(s->gain_pred_err, 0, sizeof(s->gain_pred_err));
1261 
1262  /* Apply gain to hardcoded codebook and use that as excitation signal */
1263  for (n = 0; n < size; n++)
1264  excitation[n] = wmavoice_std_codebook[r_idx + n] * gain;
1265 }
1266 
1267 /**
1268  * Parse FCB/ACB signal for a single block.
 *
 * Decodes the fixed codebook pulses and a 7-bit gain index from the
 * bitstream, builds the adaptive codebook contribution in place from the
 * excitation history preceding excitation[0], and stores the weighted sum
 * of both in excitation[0..size-1]. Also shifts the new prediction error
 * into s->gain_pred_err.
 *
 * NOTE(review): the line holding the function name is absent from this
 * listing (synth_block() invokes it as synth_block_fcb_acb(s, gb,
 * block_idx, size, block_pitch_sh2, frame_desc, excitation)), as is one
 * argument line in each of the two ff_acelp_interpolatef() calls.
 *
1269  * @note see #synth_block().
1270  */
1272  int block_idx, int size,
1273  int block_pitch_sh2,
1274  const struct frame_type_desc *frame_desc,
1275  float *excitation)
1276 {
1277  static const float gain_coeff[6] = {
1278  0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
1279  };
1280  float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
1281  int n, idx, gain_weight;
1282  AMRFixed fcb;
1283 
 /* pulses[] only holds MAX_FRAMESIZE / 2 entries */
1284  av_assert0(size <= MAX_FRAMESIZE / 2);
1285  memset(pulses, 0, sizeof(*pulses) * size);
1286 
 /* block_pitch_sh2 carries 2 fractional bits; keep the integer part */
1287  fcb.pitch_lag = block_pitch_sh2 >> 2;
1288  fcb.pitch_fac = 1.0;
1289  fcb.no_repeat_mask = 0;
1290  fcb.n = 0;
1291 
1292  /* For the other frame types, this is where we apply the innovation
1293  * (fixed) codebook pulses of the speech signal. */
1294  if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1295  aw_pulse_set1(s, gb, block_idx, &fcb);
1296  aw_pulse_set2(s, gb, block_idx, &fcb);
1297  } else /* FCB_TYPE_EXC_PULSES */ {
1298  int offset_nbits = 5 - frame_desc->log_n_blocks;
1299 
 /* all bits set: every pulse is flagged in the no-repeat mask */
1300  fcb.no_repeat_mask = -1;
1301  /* similar to ff_decode_10_pulses_35bits(), but with single pulses
1302  * (instead of double) for a subset of pulses */
1303  for (n = 0; n < 5; n++) {
1304  float sign;
1305  int pos1, pos2;
1306 
 /* track n: positions n, n + 5, n + 10, ... */
1307  sign = get_bits1(gb) ? 1.0 : -1.0;
1308  pos1 = get_bits(gb, offset_nbits);
1309  fcb.x[fcb.n] = n + 5 * pos1;
1310  fcb.y[fcb.n++] = sign;
1311  if (n < frame_desc->dbl_pulses) {
 /* second pulse on the same track; its sign is implied by
  * the order of the two positions */
1312  pos2 = get_bits(gb, offset_nbits);
1313  fcb.x[fcb.n] = n + 5 * pos2;
1314  fcb.y[fcb.n++] = (pos1 < pos2) ? -sign : sign;
1315  }
1316  }
1317  }
1318  ff_set_fixed_vector(pulses, &fcb, 1.0, size);
1319 
1320  /* Calculate gain for adaptive & fixed codebook signal.
1321  * see ff_amr_set_fixed_gain(). */
1322  idx = get_bits(gb, 7);
1323  fcb_gain = expf(ff_scalarproduct_float_c(s->gain_pred_err, gain_coeff, 6) -
1324  5.2409161640 + wmavoice_gain_codebook_fcb[idx]);
1325  acb_gain = wmavoice_gain_codebook_acb[idx];
1326  pred_err = av_clipf(wmavoice_gain_codebook_fcb[idx],
1327  -2.9957322736 /* log(0.05) */,
1328  1.6094379124 /* log(5.0) */);
1329 
 /* shift the history by gain_weight slots and fill the freed slots with
  * the new prediction error.
  * NOTE(review): this requires gain_weight <= 6, i.e. log_n_blocks >= 1
  * for every frame type that reaches this function - confirm against
  * frame_descs. */
1330  gain_weight = 8 >> frame_desc->log_n_blocks;
1331  memmove(&s->gain_pred_err[gain_weight], s->gain_pred_err,
1332  sizeof(*s->gain_pred_err) * (6 - gain_weight));
1333  for (n = 0; n < gain_weight; n++)
1334  s->gain_pred_err[n] = pred_err;
1335 
1336  /* Calculation of adaptive codebook */
1337  if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
1338  int len;
 /* the pitch changes linearly over the frame (s->pitch_diff_sh16 per
  * sample, 16.16 fixed point); the block is processed in runs of len
  * samples, each with its own integer pitch and interpolation index */
1339  for (n = 0; n < size; n += len) {
1340  int next_idx_sh16;
1341  int abs_idx = block_idx * size + n;
1342  int pitch_sh16 = (s->last_pitch_val << 16) +
1343  s->pitch_diff_sh16 * abs_idx;
1344  int pitch = (pitch_sh16 + 0x6FFF) >> 16;
1345  int idx_sh16 = ((pitch << 16) - pitch_sh16) * 8 + 0x58000;
1346  idx = idx_sh16 >> 16;
1347  if (s->pitch_diff_sh16) {
1348  if (s->pitch_diff_sh16 > 0) {
1349  next_idx_sh16 = (idx_sh16) &~ 0xFFFF;
1350  } else
1351  next_idx_sh16 = (idx_sh16 + 0x10000) &~ 0xFFFF;
 /* at least 1 sample per run, at most the rest of the block */
1352  len = av_clip((idx_sh16 - next_idx_sh16) / s->pitch_diff_sh16 / 8,
1353  1, size - n);
1354  } else
1355  len = size;
1356 
1357  ff_acelp_interpolatef(&excitation[n], &excitation[n - pitch],
1359  idx, 9, len);
1360  }
1361  } else /* ACB_TYPE_HAMMING */ {
 /* lower 2 bits of block_pitch_sh2 are the fractional pitch */
1362  int block_pitch = block_pitch_sh2 >> 2;
1363  idx = block_pitch_sh2 & 3;
1364  if (idx) {
1365  ff_acelp_interpolatef(excitation, &excitation[-block_pitch],
1367  idx, 8, size);
1368  } else
 /* integer pitch: copy from block_pitch samples back */
1369  av_memcpy_backptr((uint8_t *) excitation, sizeof(float) * block_pitch,
1370  sizeof(float) * size);
1371  }
1372 
1373  /* Interpolate ACB/FCB and use as excitation signal */
1374  ff_weighted_vector_sumf(excitation, excitation, pulses,
1375  acb_gain, fcb_gain, size);
1376 }
1378 /**
1379  * Parse data in a single block.
1380  * @note we assume enough bits are available, caller should check.
1381  *
1382  * @param s WMA Voice decoding context private data
1383  * @param gb bit I/O context
1384  * @param block_idx index of the to-be-read block
1385  * @param size amount of samples to be read in this block
1386  * @param block_pitch_sh2 pitch for this block << 2
1387  * @param lsps LSPs for (the end of) this frame
1388  * @param prev_lsps LSPs for the last frame
1389  * @param frame_desc frame type descriptor
1390  * @param excitation target memory for the ACB+FCB interpolated signal
1391  * @param synth target memory for the speech synthesis filter output
1392  * @return 0 on success, <0 on error.
 *
 * NOTE(review): the line holding the function name and return type is
 * absent from this listing. No return statement is visible in the body
 * and synth_frame() ignores the result of synth_block(), so the @return
 * line above may be stale - confirm against the signature.
1393  */
1395  int block_idx, int size,
1396  int block_pitch_sh2,
1397  const double *lsps, const double *prev_lsps,
1398  const struct frame_type_desc *frame_desc,
1399  float *excitation, float *synth)
1400 {
1401  double i_lsps[MAX_LSPS];
1402  float lpcs[MAX_LSPS];
1403  float fac;
1404  int n;
1405 
 /* build the excitation signal for this block */
1406  if (frame_desc->acb_type == ACB_TYPE_NONE)
1407  synth_block_hardcoded(s, gb, block_idx, size, frame_desc, excitation);
1408  else
1409  synth_block_fcb_acb(s, gb, block_idx, size, block_pitch_sh2,
1410  frame_desc, excitation);
1411 
1412  /* convert interpolated LSPs to LPCs */
 /* fac is the position of the block centre within the frame, in (0, 1) */
1413  fac = (block_idx + 0.5) / frame_desc->n_blocks;
1414  for (n = 0; n < s->lsps; n++) // LSF -> LSP
1415  i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n] - prev_lsps[n]));
1416  ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1417 
1418  /* Speech synthesis */
1419  ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s->lsps);
1420 }
1421 
1422 /**
1423  * Synthesize output samples for a single frame.
1424  * @note we assume enough bits are available, caller should check.
1425  *
1426  * @param ctx WMA Voice decoder context
1427  * @param gb bit I/O context (s->gb or one for cross-packet superframes)
1428  * @param frame_idx Frame number within superframe [0-2]
1429  * @param samples pointer to output sample buffer, has space for at least 160
1430  * samples
1431  * @param lsps LSP array
1432  * @param prev_lsps array of previous frame's LSPs
1433  * @param excitation target buffer for excitation signal
1434  * @param synth target buffer for synthesized speech data
1435  * @return 0 on success, <0 on error.
1436  */
/* NOTE(review): four lines of this function are absent from this listing:
 * the body of the FCB_TYPE_SILENCE case, the tail of the delta block_pitch
 * expression, and two argument lines of the av_clip() call on
 * last_block_pitch. The comments added below describe only the visible
 * statements. */
1437 static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx,
1438  float *samples,
1439  const double *lsps, const double *prev_lsps,
1440  float *excitation, float *synth)
1441 {
1442  WMAVoiceContext *s = ctx->priv_data;
1443  int n, n_blocks_x2, log_n_blocks_x2, cur_pitch_val;
1444  int pitch[MAX_BLOCKS], last_block_pitch;
1445 
1446  /* Parse frame type ("frame header"), see frame_descs */
1447  int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)], block_nsamples;
1448 
1449  if (bd_idx < 0) {
1450  av_log(ctx, AV_LOG_ERROR,
1451  "Invalid frame type VLC code, skipping\n");
1452  return -1;
1453  }
1454 
 /* every frame holds MAX_FRAMESIZE samples, split evenly over its blocks */
1455  block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;
1456 
1457  /* Pitch calculation for ACB_TYPE_ASYMMETRIC ("pitch-per-frame") */
1458  if (frame_descs[bd_idx].acb_type == ACB_TYPE_ASYMMETRIC) {
1459  /* Pitch is provided per frame, which is interpreted as the pitch of
1460  * the last sample of the last block of this frame. We can interpolate
1461  * the pitch of other blocks (and even pitch-per-sample) by gradually
1462  * incrementing/decrementing prev_frame_pitch to cur_pitch_val. */
1463  n_blocks_x2 = frame_descs[bd_idx].n_blocks << 1;
1464  log_n_blocks_x2 = frame_descs[bd_idx].log_n_blocks + 1;
1465  cur_pitch_val = s->min_pitch_val + get_bits(gb, s->pitch_nbits);
1466  cur_pitch_val = FFMIN(cur_pitch_val, s->max_pitch_val - 1);
 /* no interpolation from the previous pitch if there was none, or
  * if the two pitches differ by more than 10% of their mean */
1467  if (s->last_acb_type == ACB_TYPE_NONE ||
1468  20 * abs(cur_pitch_val - s->last_pitch_val) >
1469  (cur_pitch_val + s->last_pitch_val))
1470  s->last_pitch_val = cur_pitch_val;
1471 
1472  /* pitch per block */
1473  for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
 /* linear interpolation at the block centre, with rounding */
1474  int fac = n * 2 + 1;
1475 
1476  pitch[n] = (MUL16(fac, cur_pitch_val) +
1477  MUL16((n_blocks_x2 - fac), s->last_pitch_val) +
1478  frame_descs[bd_idx].n_blocks) >> log_n_blocks_x2;
1479  }
1480 
1481  /* "pitch-diff-per-sample" for calculation of pitch per sample */
1482  s->pitch_diff_sh16 =
1483  ((cur_pitch_val - s->last_pitch_val) << 16) / MAX_FRAMESIZE;
1484  }
1485 
1486  /* Global gain (if silence) and pitch-adaptive window coordinates */
1487  switch (frame_descs[bd_idx].fcb_type) {
1488  case FCB_TYPE_SILENCE:
1490  break;
1491  case FCB_TYPE_AW_PULSES:
 /* NOTE(review): pitch[] has only been filled above for
  * ACB_TYPE_ASYMMETRIC; presumably every AW_PULSES frame type uses
  * that ACB type - confirm against frame_descs */
1492  aw_parse_coords(s, gb, pitch);
1493  break;
1494  }
1495 
1496  for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1497  int bl_pitch_sh2;
1498 
1499  /* Pitch calculation for ACB_TYPE_HAMMING ("pitch-per-block") */
1500  switch (frame_descs[bd_idx].acb_type) {
1501  case ACB_TYPE_HAMMING: {
1502  /* Pitch is given per block. Per-block pitches are encoded as an
1503  * absolute value for the first block, and then delta values
1504  * (relative to this value) for all subsequent blocks. The scale of
1505  * this pitch value is semi-logarithmic compared to its use in the
1506  * decoder, so we convert it to normal scale also. */
 /* t1, t2, t3: number of code values in the quarter-, half- and
  * full-sample resolution segments, respectively */
1507  int block_pitch,
1508  t1 = (s->block_conv_table[1] - s->block_conv_table[0]) << 2,
1509  t2 = (s->block_conv_table[2] - s->block_conv_table[1]) << 1,
1510  t3 = s->block_conv_table[3] - s->block_conv_table[2] + 1;
1511 
1512  if (n == 0) {
1513  block_pitch = get_bits(gb, s->block_pitch_nbits);
1514  } else
1515  block_pitch = last_block_pitch - s->block_delta_pitch_hrange +
1517  /* Convert last_ so that any next delta is within _range */
1518  last_block_pitch = av_clip(block_pitch,
1520  s->block_pitch_range -
1522 
1523  /* Convert semi-log-style scale back to normal scale */
1524  if (block_pitch < t1) {
1525  bl_pitch_sh2 = (s->block_conv_table[0] << 2) + block_pitch;
1526  } else {
1527  block_pitch -= t1;
1528  if (block_pitch < t2) {
1529  bl_pitch_sh2 =
1530  (s->block_conv_table[1] << 2) + (block_pitch << 1);
1531  } else {
1532  block_pitch -= t2;
1533  if (block_pitch < t3) {
1534  bl_pitch_sh2 =
1535  (s->block_conv_table[2] + block_pitch) << 2;
1536  } else
1537  bl_pitch_sh2 = s->block_conv_table[3] << 2;
1538  }
1539  }
1540  pitch[n] = bl_pitch_sh2 >> 2;
1541  break;
1542  }
1543 
1544  case ACB_TYPE_ASYMMETRIC: {
 /* integer pitch computed above, no fractional part */
1545  bl_pitch_sh2 = pitch[n] << 2;
1546  break;
1547  }
1548 
1549  default: // ACB_TYPE_NONE has no pitch
1550  bl_pitch_sh2 = 0;
1551  break;
1552  }
1553 
1554  synth_block(s, gb, n, block_nsamples, bl_pitch_sh2,
1555  lsps, prev_lsps, &frame_descs[bd_idx],
1556  &excitation[n * block_nsamples],
1557  &synth[n * block_nsamples]);
1558  }
1559 
1560  /* Averaging projection filter, if applicable. Else, just copy samples
1561  * from synthesis buffer */
1562  if (s->do_apf) {
1563  double i_lsps[MAX_LSPS];
1564  float lpcs[MAX_LSPS];
1565 
 /* the postfilter runs on two halves of 80 samples: the first uses
  * the average of previous and current LSPs, the second the current
  * LSPs only.
  * NOTE(review): pitch[0] is passed in both calls and is not set for
  * ACB_TYPE_NONE frames - confirm postfilter() ignores it then. */
1566  for (n = 0; n < s->lsps; n++) // LSF -> LSP
1567  i_lsps[n] = cos(0.5 * (prev_lsps[n] + lsps[n]));
1568  ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1569  postfilter(s, synth, samples, 80, lpcs,
1570  &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx],
1571  frame_descs[bd_idx].fcb_type, pitch[0]);
1572 
1573  for (n = 0; n < s->lsps; n++) // LSF -> LSP
1574  i_lsps[n] = cos(lsps[n]);
1575  ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1576  postfilter(s, &synth[80], &samples[80], 80, lpcs,
1577  &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx + 80],
1578  frame_descs[bd_idx].fcb_type, pitch[0]);
1579  } else
1580  memcpy(samples, synth, 160 * sizeof(synth[0]));
1581 
1582  /* Cache values for next frame */
1583  s->frame_cntr++;
1584  if (s->frame_cntr >= 0xFFFF) s->frame_cntr -= 0xFFFF; // i.e. modulo (%)
1585  s->last_acb_type = frame_descs[bd_idx].acb_type;
1586  switch (frame_descs[bd_idx].acb_type) {
1587  case ACB_TYPE_NONE:
1588  s->last_pitch_val = 0;
1589  break;
1590  case ACB_TYPE_ASYMMETRIC:
1591  s->last_pitch_val = cur_pitch_val;
1592  break;
1593  case ACB_TYPE_HAMMING:
 /* pitch of the last block of this frame */
1594  s->last_pitch_val = pitch[frame_descs[bd_idx].n_blocks - 1];
1595  break;
1596  }
1597 
1598  return 0;
1599 }
1600 
1601 /**
1602  * Ensure minimum value for first item, maximum value for last value,
1603  * proper spacing between each value and proper ordering.
1604  *
1605  * @param lsps array of LSPs
1606  * @param num size of LSP array
1607  *
1608  * @note basically a double version of #ff_acelp_reorder_lsf(), might be
1609  * useful to put in a generic location later on. Parts are also
1610  * present in #ff_set_min_dist_lsf() + #ff_sort_nearly_sorted_floats(),
1611  * which is in float.
1612  */
1613 static void stabilize_lsps(double *lsps, int num)
1614 {
1615  int n, m, l;
1616 
1617  /* set minimum value for first, maximum value for last and minimum
1618  * spacing between LSF values.
1619  * Very similar to ff_set_min_dist_lsf(), but in double. */
1620  lsps[0] = FFMAX(lsps[0], 0.0015 * M_PI);
1621  for (n = 1; n < num; n++)
1622  lsps[n] = FFMAX(lsps[n], lsps[n - 1] + 0.0125 * M_PI);
1623  lsps[num - 1] = FFMIN(lsps[num - 1], 0.9985 * M_PI);
1624 
1625  /* reorder (looks like one-time / non-recursed bubblesort).
1626  * Very similar to ff_sort_nearly_sorted_floats(), but in double. */
1627  for (n = 1; n < num; n++) {
1628  if (lsps[n] < lsps[n - 1]) {
1629  for (m = 1; m < num; m++) {
1630  double tmp = lsps[m];
1631  for (l = m - 1; l >= 0; l--) {
1632  if (lsps[l] <= tmp) break;
1633  lsps[l + 1] = lsps[l];
1634  }
1635  lsps[l + 1] = tmp;
1636  }
1637  break;
1638  }
1639  }
1640 }
1641 
1642 /**
1643  * Test if there's enough bits to read 1 superframe.
 *
 * Walks the superframe on a private copy of the bit reader, parsing only
 * the fields that determine how many bits follow and skipping the rest.
1644  *
1645  * @param orig_gb bit I/O context used for reading. This function
1646  * does not modify the state of the bitreader; it
1647  * only uses it to copy the current stream position
1648  * @param s WMA Voice decoding context private data
1649  * @return -1 if unsupported, 1 on not enough bits or 0 if OK.
 *
 * NOTE(review): the line holding the function name is absent from this
 * listing (synth_superframe() invokes it as
 * check_bits_for_superframe(gb, s)), as are the statements following the
 * two LSP bit-size checks; presumably those skip the LSP bits - confirm.
1650  */
1652  WMAVoiceContext *s)
1653 {
1654  GetBitContext s_gb, *gb = &s_gb;
1655  int n, need_bits, bd_idx;
1656  const struct frame_type_desc *frame_desc;
1657 
1658  /* initialize a copy */
1659  init_get_bits(gb, orig_gb->buffer, orig_gb->size_in_bits);
1660  skip_bits_long(gb, get_bits_count(orig_gb));
1661  av_assert1(get_bits_left(gb) == get_bits_left(orig_gb));
1662 
1663  /* superframe header */
 /* 14 = 1 type bit + 1 flag bit + optional 12-bit sample count */
1664  if (get_bits_left(gb) < 14)
1665  return 1;
1666  if (!get_bits1(gb))
1667  return -1; // WMAPro-in-WMAVoice superframe
1668  if (get_bits1(gb)) skip_bits(gb, 12); // number of samples in superframe
1669  if (s->has_residual_lsps) { // residual LSPs (for all frames)
1670  if (get_bits_left(gb) < s->sframe_lsp_bitsize)
1671  return 1;
1673  }
1674 
1675  /* frames */
1676  for (n = 0; n < MAX_FRAMES; n++) {
1677  int aw_idx_is_ext = 0;
1678 
1679  if (!s->has_residual_lsps) { // independent LSPs (per-frame)
1680  if (get_bits_left(gb) < s->frame_lsp_bitsize) return 1;
1682  }
 /* same frame type lookup as in synth_frame() */
1683  bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)];
1684  if (bd_idx < 0)
1685  return -1; // invalid frame type VLC code
1686  frame_desc = &frame_descs[bd_idx];
1687  if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
1688  if (get_bits_left(gb) < s->pitch_nbits)
1689  return 1;
1690  skip_bits_long(gb, s->pitch_nbits);
1691  }
1692  if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
1693  skip_bits(gb, 8);
1694  } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
 /* mirrors aw_parse_coords(): values >= 54 (0x36) carry 2 more
  * bits and mark the index as extended */
1695  int tmp = get_bits(gb, 6);
1696  if (tmp >= 0x36) {
1697  skip_bits(gb, 2);
1698  aw_idx_is_ext = 1;
1699  }
1700  }
1701 
1702  /* blocks */
1703  if (frame_desc->acb_type == ACB_TYPE_HAMMING) {
 /* one absolute pitch plus a delta for every further block */
1704  need_bits = s->block_pitch_nbits +
1705  (frame_desc->n_blocks - 1) * s->block_delta_pitch_nbits;
1706  } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
 /* aw_pulse_set1() reads 2 bits fewer for block 0 when the index
  * is extended; frame_size presumably counts that shorter case,
  * so 2 bits are added back otherwise - confirm against
  * frame_descs */
1707  need_bits = 2 * !aw_idx_is_ext;
1708  } else
1709  need_bits = 0;
1710  need_bits += frame_desc->frame_size;
1711  if (get_bits_left(gb) < need_bits)
1712  return 1;
1713  skip_bits_long(gb, need_bits);
1714  }
1715 
1716  return 0;
1717 }
1718 
1719 /**
1720  * Synthesize output samples for a single superframe. If we have any data
1721  * cached in s->sframe_cache, that will be used instead of whatever is loaded
1722  * in s->gb.
1723  *
1724  * WMA Voice superframes contain 3 frames, each containing 160 audio samples,
1725  * to give a total of 480 samples per superframe. See #synth_frame() for frame
1726  * parsing. In addition to 3 frames, superframes can also contain the LSPs
1727  * (if these are globally specified for all frames (residually); they can
1728  * also be specified individually per-frame. See the s->has_residual_lsps
1729  * option), and can specify the number of samples encoded in this superframe
1730  * (if less than 480), usually used to prevent blanks at track boundaries.
1731  *
1732  * @param ctx WMA Voice decoder context
1733  * @return 0 on success, <0 on error or 1 if there was not enough data to
1734  * fully parse the superframe
1735  */
/* Decodes one superframe into s->frame. *got_frame_ptr is set to 1 once all
 * three frames have been synthesized, and to 0 on the not-enough-data exit
 * and when synth_frame() fails; the other error returns leave it untouched.
 * NOTE(review): listing lines 1743 and 1755 are absent from this copy, so the
 * mean_lsf initializer and the statement after "gb = &s_gb;" are cut off. */
1736 static int synth_superframe(AVCodecContext *ctx, int *got_frame_ptr)
1737 {
1738  WMAVoiceContext *s = ctx->priv_data;
1739  GetBitContext *gb = &s->gb, s_gb;
1740  int n, res, n_samples = 480;
1741  double lsps[MAX_FRAMES][MAX_LSPS];
1742  const double *mean_lsf = s->lsps == 16 ?
1744  float excitation[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE + 12];
1745  float synth[MAX_LSPS + MAX_SFRAMESIZE];
1746  float *samples;
1747 
/* Seed the local work buffers with the history kept from the previous call:
 * s->lsps synthesis samples and s->history_nsamples excitation samples. */
1748  memcpy(synth, s->synth_history,
1749  s->lsps * sizeof(*synth));
1750  memcpy(excitation, s->excitation_history,
1751  s->history_nsamples * sizeof(*excitation));
1752 
/* With cached bits pending, read from the local reader s_gb instead of s->gb
 * and mark the cache as consumed. NOTE(review): s_gb is presumably set up on
 * the missing listing line 1755 - confirm against the full file. */
1753  if (s->sframe_cache_size > 0) {
1754  gb = &s_gb;
1756  s->sframe_cache_size = 0;
1757  }
1758 
/* NOTE(review): only the value 1 is tested here; any other result, including
 * a negative one, continues into parsing - confirm this is intended. */
1759  if ((res = check_bits_for_superframe(gb, s)) == 1) {
1760  *got_frame_ptr = 0;
1761  return 1;
1762  }
1763 
1764  /* First bit is speech/music bit, it differentiates between WMAVoice
1765  * speech samples (the actual codec) and WMAVoice music samples, which
1766  * are really WMAPro-in-WMAVoice-superframes. I've never seen those in
1767  * the wild yet. */
1768  if (!get_bits1(gb)) {
1769  av_log_missing_feature(ctx, "WMAPro-in-WMAVoice", 1);
1770  return AVERROR_PATCHWELCOME;
1771  }
1772 
1773  /* (optional) nr. of samples in superframe; always <= 480 and >= 0 */
/* One flag bit, then a 12-bit count; without the flag n_samples stays 480. */
1774  if (get_bits1(gb)) {
1775  if ((n_samples = get_bits(gb, 12)) > 480) {
1776  av_log(ctx, AV_LOG_ERROR,
1777  "Superframe encodes >480 samples (%d), not allowed\n",
1778  n_samples);
1779  return -1;
1780  }
1781  }
1782  /* Parse LSPs, if global for the superframe (can also be per-frame). */
1783  if (s->has_residual_lsps) {
1784  double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];
1785 
/* The dequantizer is handed the previous LSPs with the mean removed. */
1786  for (n = 0; n < s->lsps; n++)
1787  prev_lsps[n] = s->prev_lsps[n] - mean_lsf[n];
1788 
1789  if (s->lsps == 10) {
1790  dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1791  } else /* s->lsps == 16 */
1792  dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1793 
/* Frames 0 and 1 are rebuilt from a1/a2 (a1 holds two runs of s->lsps
 * values, a2 is interleaved per frame); frame 2 only gets the mean added. */
1794  for (n = 0; n < s->lsps; n++) {
1795  lsps[0][n] = mean_lsf[n] + (a1[n] - a2[n * 2]);
1796  lsps[1][n] = mean_lsf[n] + (a1[s->lsps + n] - a2[n * 2 + 1]);
1797  lsps[2][n] += mean_lsf[n];
1798  }
1799  for (n = 0; n < 3; n++)
1800  stabilize_lsps(lsps[n], s->lsps);
1801  }
1802 
1803  /* get output buffer */
/* The buffer is always requested for 480 samples; nb_samples is lowered to
 * the signalled count only after the allocation. */
1804  s->frame.nb_samples = 480;
1805  if ((res = ff_get_buffer(ctx, &s->frame)) < 0) {
1806  av_log(ctx, AV_LOG_ERROR, "get_buffer() failed\n");
1807  return res;
1808  }
1809  s->frame.nb_samples = n_samples;
1810  samples = (float *)s->frame.data[0];
1811 
1812  /* Parse frames, optionally preceded by per-frame (independent) LSPs. */
1813  for (n = 0; n < 3; n++) {
1814  if (!s->has_residual_lsps) {
1815  int m;
1816 
1817  if (s->lsps == 10) {
1818  dequant_lsp10i(gb, lsps[n]);
1819  } else /* s->lsps == 16 */
1820  dequant_lsp16i(gb, lsps[n]);
1821 
1822  for (m = 0; m < s->lsps; m++)
1823  lsps[n][m] += mean_lsf[m];
1824  stabilize_lsps(lsps[n], s->lsps);
1825  }
1826 
/* Frame n works on its own MAX_FRAMESIZE-sized slice of the output,
 * excitation and synth buffers. The "previous" LSPs are s->prev_lsps for
 * frame 0 and the preceding frame's LSPs otherwise. Any non-zero result
 * aborts the superframe. */
1827  if ((res = synth_frame(ctx, gb, n,
1828  &samples[n * MAX_FRAMESIZE],
1829  lsps[n], n == 0 ? s->prev_lsps : lsps[n - 1],
1830  &excitation[s->history_nsamples + n * MAX_FRAMESIZE],
1831  &synth[s->lsps + n * MAX_FRAMESIZE]))) {
1832  *got_frame_ptr = 0;
1833  return res;
1834  }
1835  }
1836 
1837  /* Statistics? FIXME - we don't check for length, a slight overrun
1838  * will be caught by internal buffer padding, and anything else
1839  * will be skipped, not read. */
/* res is reused as scratch: a 4-bit value selects how many 10-bit units
 * (value + 1) are skipped. */
1840  if (get_bits1(gb)) {
1841  res = get_bits(gb, 4);
1842  skip_bits(gb, 10 * (res + 1));
1843  }
1844 
1845  *got_frame_ptr = 1;
1846 
1847  /* Update history */
/* The last frame's LSPs and the samples starting at offset MAX_SFRAMESIZE
 * of each work buffer become the history for the next superframe. */
1848  memcpy(s->prev_lsps, lsps[2],
1849  s->lsps * sizeof(*s->prev_lsps));
1850  memcpy(s->synth_history, &synth[MAX_SFRAMESIZE],
1851  s->lsps * sizeof(*synth));
1852  memcpy(s->excitation_history, &excitation[MAX_SFRAMESIZE],
1853  s->history_nsamples * sizeof(*excitation));
/* memmove rather than memcpy: source and destination are both inside
 * s->zero_exc_pf. */
1854  if (s->do_apf)
1855  memmove(s->zero_exc_pf, &s->zero_exc_pf[MAX_SFRAMESIZE],
1856  s->history_nsamples * sizeof(*s->zero_exc_pf));
1857 
1858  return 0;
1859 }
1860 
1861 /**
1862  * Parse the packet header at the start of each packet (input data to this
1863  * decoder).
1864  *
1865  * @param s WMA Voice decoding context private data
1866  * @return 1 if not enough bits were available, or 0 on success.
1867  */
1869 {
/* NOTE(review): the signature (listing line 1868) and listing line 1883 are
 * absent from this copy; `s` is presumably the decoder context parameter. */
1870  GetBitContext *gb = &s->gb;
1871  unsigned int res;
1872 
/* 4 + 1 + 6 bits are consumed before the first length check inside the loop. */
1873  if (get_bits_left(gb) < 11)
1874  return 1;
1875  skip_bits(gb, 4); // packet sequence number
1876  s->has_residual_lsps = get_bits1(gb);
/* Each 6-bit field overwrites res (nothing is accumulated); the value 0x3F
 * means another 6-bit field follows. */
1877  do {
1878  res = get_bits(gb, 6); // number of superframes per packet
1879  // (minus first one if there is spillover)
/* Require room for the next 6-bit field (only after 0x3F) plus
 * s->spillover_bitsize further bits. */
1880  if (get_bits_left(gb) < 6 * (res == 0x3F) + s->spillover_bitsize)
1881  return 1;
1882  } while (res == 0x3F);
1884 
1885  return 0;
1886 }
1887 
1888 /**
1889  * Copy (unaligned) bits from gb/data/size to pb.
1890  *
1891  * @param pb target buffer to copy bits into
1892  * @param data source buffer to copy bits from
1893  * @param size size of the source data, in bytes
1894  * @param gb bit I/O context specifying the current position in the source
1895  * data. This function might use this to align the bit position to
1896  * a whole-byte boundary before calling #avpriv_copy_bits() on aligned
1897  * source data
1898  * @param nbits the amount of bits to copy from source to target
1899  *
1900  * @note after calling this function, the current position in the input bit
1901  * I/O context is undefined.
1902  */
1903 static void copy_bits(PutBitContext *pb,
1904  const uint8_t *data, int size,
1905  GetBitContext *gb, int nbits)
1906 {
1907  int rmn_bytes, rmn_bits;
1908 
1909  rmn_bits = rmn_bytes = get_bits_left(gb);
1910  if (rmn_bits < nbits)
1911  return;
1912  if (nbits > pb->size_in_bits - put_bits_count(pb))
1913  return;
1914  rmn_bits &= 7; rmn_bytes >>= 3;
1915  if ((rmn_bits = FFMIN(rmn_bits, nbits)) > 0)
1916  put_bits(pb, rmn_bits, get_bits(gb, rmn_bits));
1917  avpriv_copy_bits(pb, data + size - rmn_bytes,
1918  FFMIN(nbits - rmn_bits, rmn_bytes << 3));
1919 }
1920 
1921 /**
1922  * Packet decoding: a packet is anything that the (ASF) demuxer contains,
1923  * and we expect that the demuxer / application provides it to us as such
1924  * (else you'll probably get garbage as output). Every packet has a size of
1925  * ctx->block_align bytes, starts with a packet header (see
1926  * #parse_packet_header()), and then a series of superframes. Superframe
1927  * boundaries may exceed packets, i.e. superframes can split data over
1928  * multiple (two) packets.
1929  *
1930  * For more information about frames, see #synth_superframe().
1931  */
1933  int *got_frame_ptr, AVPacket *avpkt)
1934 {
/* NOTE(review): the line that opens this definition (listing line 1932) is
 * absent from this copy; `ctx` and `data` used below are presumably declared
 * there. Listing lines 1966 and 2000 are absent as well.
 * Every non-error return is a byte count: either the bytes consumed up to the
 * end of the decoded superframe (cnt >> 3) or the whole chunk (size). */
1935  WMAVoiceContext *s = ctx->priv_data;
1936  GetBitContext *gb = &s->gb;
1937  int size, res, pos;
1938 
1939  /* Packets are sometimes a multiple of ctx->block_align, with a packet
1940  * header at each ctx->block_align bytes. However, FFmpeg's ASF demuxer
1941  * feeds us ASF packets, which may concatenate multiple "codec" packets
1942  * in a single "muxer" packet, so we artificially emulate that by
1943  * capping the packet size at ctx->block_align. */
/* Empty-bodied loop: subtracts block_align until size is at most
 * ctx->block_align. */
1944  for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align);
1945  if (!size) {
1946  *got_frame_ptr = 0;
1947  return 0;
1948  }
1949  init_get_bits(&s->gb, avpkt->data, size << 3);
1950 
1951  /* size == ctx->block_align is used to indicate whether we are dealing with
1952  * a new packet or a packet of which we already read the packet header
1953  * previously. */
1954  if (size == ctx->block_align) { // new packet header
/* NOTE(review): parse_packet_header() is documented as returning 1 or 0,
 * which this "< 0" test never matches - confirm the intended handling of
 * the not-enough-bits case. */
1955  if ((res = parse_packet_header(s)) < 0)
1956  return res;
1957 
1958  /* If the packet header specifies a s->spillover_nbits, then we want
1959  * to push out all data of the previous packet (+ spillover) before
1960  * continuing to parse new superframes in the current packet. */
1961  if (s->spillover_nbits > 0) {
1962  if (s->sframe_cache_size > 0) {
/* cnt remembers the reader position before the spillover bits are
 * appended to the bit writer s->pb. */
1963  int cnt = get_bits_count(gb);
1964  copy_bits(&s->pb, avpkt->data, size, gb, s->spillover_nbits);
1965  flush_put_bits(&s->pb);
1967  if ((res = synth_superframe(ctx, got_frame_ptr)) == 0 &&
1968  *got_frame_ptr) {
/* Report the bytes up to the end of the spillover; the leftover bit
 * offset is stored so that the next call can skip it. */
1969  cnt += s->spillover_nbits;
1970  s->skip_bits_next = cnt & 7;
1971  *(AVFrame *)data = s->frame;
1972  return cnt >> 3;
1973  } else
/* Skip whatever part of the spillover the reader has not passed yet. */
1974  skip_bits_long (gb, s->spillover_nbits - cnt +
1975  get_bits_count(gb)); // resync
1976  } else
1977  skip_bits_long(gb, s->spillover_nbits); // resync
1978  }
1979  } else if (s->skip_bits_next)
1980  skip_bits(gb, s->skip_bits_next);
1981 
1982  /* Try parsing superframes in current packet */
1983  s->sframe_cache_size = 0;
1984  s->skip_bits_next = 0;
/* pos = bits left before the attempt; it becomes the cache size if the
 * superframe turns out to be incomplete. */
1985  pos = get_bits_left(gb);
1986  if ((res = synth_superframe(ctx, got_frame_ptr)) < 0) {
1987  return res;
1988  } else if (*got_frame_ptr) {
1989  int cnt = get_bits_count(gb);
1990  s->skip_bits_next = cnt & 7;
1991  *(AVFrame *)data = s->frame;
1992  return cnt >> 3;
1993  } else if ((s->sframe_cache_size = pos) > 0) {
1994  /* rewind bit reader to start of last (incomplete) superframe... */
1995  init_get_bits(gb, avpkt->data, size << 3);
1996  skip_bits_long(gb, (size << 3) - pos);
1997  av_assert1(get_bits_left(gb) == pos);
1998 
1999  /* ...and cache it for spillover in next packet */
/* NOTE(review): s->pb is presumably re-initialised on the missing listing
 * line 2000 before the copy below - confirm against the full file. */
2001  copy_bits(&s->pb, avpkt->data, size, gb, s->sframe_cache_size);
2002  // FIXME bad - just copy bytes as whole and use the
2003  // skip_bits_next field
2004  }
2005 
/* No frame was produced: the whole chunk counts as consumed. */
2006  return size;
2007 }
2008 
2010 {
/* NOTE(review): the signature (listing line 2009) is absent from this copy;
 * `ctx` is presumably the codec context parameter. */
2011  WMAVoiceContext *s = ctx->priv_data;
2012 
/* The four transform contexts are torn down only when s->do_apf is set. */
2013  if (s->do_apf) {
2014  ff_rdft_end(&s->rdft);
2015  ff_rdft_end(&s->irdft);
2016  ff_dct_end(&s->dct);
2017  ff_dct_end(&s->dst);
2018  }
2019 
/* Always returns 0. */
2020  return 0;
2021 }
2022 
2024 {
/* NOTE(review): the signature (listing line 2023) is absent from this copy;
 * the codec table in this file assigns wmavoice_flush to .flush, so this is
 * presumably that function - confirm.
 * Resets the decoder's running state held in the private context. */
2025  WMAVoiceContext *s = ctx->priv_data;
2026  int n;
2027 
2028  s->postfilter_agc = 0;
2029  s->sframe_cache_size = 0;
2030  s->skip_bits_next = 0;
/* Previous LSPs restart at s->lsps evenly spaced values strictly between
 * 0 and pi. */
2031  for (n = 0; n < s->lsps; n++)
2032  s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
/* The history buffers are cleared over their full MAX_* capacity. */
2033  memset(s->excitation_history, 0,
2034  sizeof(*s->excitation_history) * MAX_SIGNAL_HISTORY);
2035  memset(s->synth_history, 0,
2036  sizeof(*s->synth_history) * MAX_LSPS);
2037  memset(s->gain_pred_err, 0,
2038  sizeof(s->gain_pred_err));
2039 
/* The remaining state is reset only when s->do_apf is set. */
2040  if (s->do_apf) {
/* Clears the s->lsps entries that end at index MAX_LSPS_ALIGN16. */
2041  memset(&s->synth_filter_out_buf[MAX_LSPS_ALIGN16 - s->lsps], 0,
2042  sizeof(*s->synth_filter_out_buf) * s->lsps);
2043  memset(s->dcf_mem, 0,
2044  sizeof(*s->dcf_mem) * 2);
2045  memset(s->zero_exc_pf, 0,
2046  sizeof(*s->zero_exc_pf) * s->history_nsamples);
2047  memset(s->denoise_filter_cache, 0, sizeof(s->denoise_filter_cache));
2048  }
2049 }
2050 
/* Codec description table for the "wmavoice" audio decoder.
 * NOTE(review): the line opening this initializer (listing line 2051) and
 * listing lines 2056-2058 are absent from this copy, so the fields that
 * belong there are not shown. */
2052  .name = "wmavoice",
2053  .type = AVMEDIA_TYPE_AUDIO,
2054  .id = AV_CODEC_ID_WMAVOICE,
/* Size of the per-instance private context. */
2055  .priv_data_size = sizeof(WMAVoiceContext),
2059  .capabilities = CODEC_CAP_SUBFRAMES | CODEC_CAP_DR1,
2060  .flush = wmavoice_flush,
2061  .long_name = NULL_IF_CONFIG_SMALL("Windows Media Audio Voice"),
2062 };