FFmpeg
vp9dsp.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 Ronald S. Bultje <rsbultje@gmail.com>
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License along
17  * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
18  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19  */
20 
21 #include <math.h>
22 #include <string.h>
23 #include "checkasm.h"
24 #include "libavcodec/vp9data.h"
25 #include "libavcodec/vp9.h"
26 #include "libavutil/common.h"
27 #include "libavutil/internal.h"
28 #include "libavutil/intreadwrite.h"
29 #include "libavutil/mathematics.h"
30 
/* Masks applied to a 32-bit random word so that the pixels packed into it
 * stay within range; indexed by (bit_depth - 8) >> 1. */
static const uint32_t pixel_mask[3] = {
    0xffffffff, /*  8 bpp: four 8-bit pixels per word  */
    0x03ff03ff, /* 10 bpp: two 10-bit pixels per word  */
    0x0fff0fff, /* 12 bpp: two 12-bit pixels per word  */
};

/* Bytes per pixel for whatever `bit_depth` is in scope at the use site. */
#define SIZEOF_PIXEL ((bit_depth + 7) / 8)
33 
/* Fill the intra-prediction edge buffers with random, in-range pixels.
 * Uses `a`, `l`, `size` and `bit_depth` from the expanding scope.
 * The first buffer is written from 4 bytes before `a` and always spans at
 * least 8 pixels; the second buffer receives `size` pixels. */
#define randomize_buffers()                                                 \
    do {                                                                    \
        const uint32_t px_mask = pixel_mask[(bit_depth - 8) >> 1];          \
        const int a_bytes = SIZEOF_PIXEL * FFMAX(8, size);                  \
        const int l_bytes = size * SIZEOF_PIXEL;                            \
        int pos;                                                            \
        for (pos = -4; pos < a_bytes; pos += 4)                             \
            AV_WN32A(a + pos, rnd() & px_mask);                             \
        for (pos = 0; pos < l_bytes; pos += 4)                              \
            AV_WN32A(l + pos, rnd() & px_mask);                             \
    } while (0)
47 
48 static void check_ipred(void)
49 {
50  LOCAL_ALIGNED_32(uint8_t, a_buf, [64 * 2]);
51  uint8_t *a = &a_buf[32 * 2];
52  LOCAL_ALIGNED_32(uint8_t, l, [32 * 2]);
53  LOCAL_ALIGNED_32(uint8_t, dst0, [32 * 32 * 2]);
54  LOCAL_ALIGNED_32(uint8_t, dst1, [32 * 32 * 2]);
55  VP9DSPContext dsp;
56  int tx, mode, bit_depth;
58  const uint8_t *left, const uint8_t *top);
59  static const char *const mode_names[N_INTRA_PRED_MODES] = {
60  [VERT_PRED] = "vert",
61  [HOR_PRED] = "hor",
62  [DC_PRED] = "dc",
63  [DIAG_DOWN_LEFT_PRED] = "diag_downleft",
64  [DIAG_DOWN_RIGHT_PRED] = "diag_downright",
65  [VERT_RIGHT_PRED] = "vert_right",
66  [HOR_DOWN_PRED] = "hor_down",
67  [VERT_LEFT_PRED] = "vert_left",
68  [HOR_UP_PRED] = "hor_up",
69  [TM_VP8_PRED] = "tm",
70  [LEFT_DC_PRED] = "dc_left",
71  [TOP_DC_PRED] = "dc_top",
72  [DC_128_PRED] = "dc_128",
73  [DC_127_PRED] = "dc_127",
74  [DC_129_PRED] = "dc_129",
75  };
76 
77  for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
78  ff_vp9dsp_init(&dsp, bit_depth, 0);
79  for (tx = 0; tx < 4; tx++) {
80  int size = 4 << tx;
81 
82  for (mode = 0; mode < N_INTRA_PRED_MODES; mode++) {
83  if (check_func(dsp.intra_pred[tx][mode], "vp9_%s_%dx%d_%dbpp",
84  mode_names[mode], size, size, bit_depth)) {
86  call_ref(dst0, size * SIZEOF_PIXEL, l, a);
87  call_new(dst1, size * SIZEOF_PIXEL, l, a);
88  if (memcmp(dst0, dst1, size * size * SIZEOF_PIXEL))
89  fail();
90  bench_new(dst1, size * SIZEOF_PIXEL,l, a);
91  }
92  }
93  }
94  }
95  report("ipred");
96 }
97 
#undef randomize_buffers

/* Fill `src` and `dst` (sz x sz pixels each) with random in-range pixels and
 * store their per-pixel difference src - dst in `coef`.
 * Uses `sz`, `bit_depth` and the loop counters `x`, `y` from the expanding
 * scope. At 8 bpp the difference is stored as int16_t, otherwise `coef` is
 * addressed as int32_t and the pixels as uint16_t. */
#define randomize_buffers()                                                 \
    do {                                                                    \
        const uint32_t px_mask = pixel_mask[(bit_depth - 8) >> 1];          \
        const int row_bytes = sz * SIZEOF_PIXEL;                            \
        for (y = 0; y < sz; y++) {                                          \
            const int row   = y * row_bytes;                                \
            const int first = y * sz;                                       \
            for (x = 0; x < row_bytes; x += 4) {                            \
                const uint32_t dst_px = rnd() & px_mask;                    \
                AV_WN32A(dst + row + x, dst_px);                            \
                AV_WN32A(src + row + x, rnd() & px_mask);                   \
            }                                                               \
            for (x = 0; x < sz; x++) {                                      \
                if (bit_depth != 8) {                                       \
                    ((int32_t *) coef)[first + x] =                         \
                        ((uint16_t *) src)[first + x] -                     \
                        ((uint16_t *) dst)[first + x];                      \
                } else {                                                    \
                    coef[first + x] = src[first + x] - dst[first + x];      \
                }                                                           \
            }                                                               \
        }                                                                   \
    } while(0)
120 
121 // wht function copied from libvpx
122 static void fwht_1d(double *out, const double *in, int sz)
123 {
124  double t0 = in[0] + in[1];
125  double t3 = in[3] - in[2];
126  double t4 = trunc((t0 - t3) * 0.5);
127  double t1 = t4 - in[1];
128  double t2 = t4 - in[2];
129 
130  out[0] = t0 - t2;
131  out[1] = t2;
132  out[2] = t3 + t1;
133  out[3] = t1;
134 }
135 
136 // standard DCT-II
137 static void fdct_1d(double *out, const double *in, int sz)
138 {
139  int k, n;
140 
141  for (k = 0; k < sz; k++) {
142  out[k] = 0.0;
143  for (n = 0; n < sz; n++)
144  out[k] += in[n] * cos(M_PI * (2 * n + 1) * k / (sz * 2.0));
145  }
146  out[0] *= M_SQRT1_2;
147 }
148 
149 // see "Towards jointly optimal spatial prediction and adaptive transform in
150 // video/image coding", by J. Han, A. Saxena, and K. Rose
151 // IEEE Proc. ICASSP, pp. 726-729, Mar. 2010.
152 static void fadst4_1d(double *out, const double *in, int sz)
153 {
154  int k, n;
155 
156  for (k = 0; k < sz; k++) {
157  out[k] = 0.0;
158  for (n = 0; n < sz; n++)
159  out[k] += in[n] * sin(M_PI * (n + 1) * (2 * k + 1) / (sz * 2.0 + 1.0));
160  }
161 }
162 
163 // see "A Butterfly Structured Design of The Hybrid Transform Coding Scheme",
164 // by Jingning Han, Yaowu Xu, and Debargha Mukherjee
165 // http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/41418.pdf
166 static void fadst_1d(double *out, const double *in, int sz)
167 {
168  int k, n;
169 
170  for (k = 0; k < sz; k++) {
171  out[k] = 0.0;
172  for (n = 0; n < sz; n++)
173  out[k] += in[n] * sin(M_PI * (2 * n + 1) * (2 * k + 1) / (sz * 4.0));
174  }
175 }
176 
177 typedef void (*ftx1d_fn)(double *out, const double *in, int sz);
178 static void ftx_2d(double *out, const double *in, enum TxfmMode tx,
179  enum TxfmType txtp, int sz)
180 {
181  static const double scaling_factors[5][4] = {
182  { 4.0, 16.0 * M_SQRT1_2 / 3.0, 16.0 * M_SQRT1_2 / 3.0, 32.0 / 9.0 },
183  { 2.0, 2.0, 2.0, 2.0 },
184  { 1.0, 1.0, 1.0, 1.0 },
185  { 0.25 },
186  { 4.0 }
187  };
188  static const ftx1d_fn ftx1d_tbl[5][4][2] = {
189  {
190  { fdct_1d, fdct_1d },
191  { fadst4_1d, fdct_1d },
192  { fdct_1d, fadst4_1d },
193  { fadst4_1d, fadst4_1d },
194  }, {
195  { fdct_1d, fdct_1d },
196  { fadst_1d, fdct_1d },
197  { fdct_1d, fadst_1d },
198  { fadst_1d, fadst_1d },
199  }, {
200  { fdct_1d, fdct_1d },
201  { fadst_1d, fdct_1d },
202  { fdct_1d, fadst_1d },
203  { fadst_1d, fadst_1d },
204  }, {
205  { fdct_1d, fdct_1d },
206  }, {
207  { fwht_1d, fwht_1d },
208  },
209  };
210  double temp[1024];
211  double scaling_factor = scaling_factors[tx][txtp];
212  int i, j;
213 
214  // cols
215  for (i = 0; i < sz; ++i) {
216  double temp_out[32];
217 
218  ftx1d_tbl[tx][txtp][0](temp_out, &in[i * sz], sz);
219  // scale and transpose
220  for (j = 0; j < sz; ++j)
221  temp[j * sz + i] = temp_out[j] * scaling_factor;
222  }
223 
224  // rows
225  for (i = 0; i < sz; i++)
226  ftx1d_tbl[tx][txtp][1](&out[i * sz], &temp[i * sz], sz);
227 }
228 
229 static void ftx(int16_t *buf, enum TxfmMode tx,
230  enum TxfmType txtp, int sz, int bit_depth)
231 {
232  double ind[1024], outd[1024];
233  int n;
234 
235  emms_c();
236  for (n = 0; n < sz * sz; n++) {
237  if (bit_depth == 8)
238  ind[n] = buf[n];
239  else
240  ind[n] = ((int32_t *) buf)[n];
241  }
242  ftx_2d(outd, ind, tx, txtp, sz);
243  for (n = 0; n < sz * sz; n++) {
244  if (bit_depth == 8)
245  buf[n] = lrint(outd[n]);
246  else
247  ((int32_t *) buf)[n] = lrint(outd[n]);
248  }
249 }
250 
251 static int copy_subcoefs(int16_t *out, const int16_t *in, enum TxfmMode tx,
252  enum TxfmType txtp, int sz, int sub, int bit_depth)
253 {
254  // copy the topleft coefficients such that the return value (being the
255  // coefficient scantable index for the eob token) guarantees that only
256  // the topleft $sub out of $sz (where $sz >= $sub) coefficients in both
257  // dimensions are non-zero. This leads to braching to specific optimized
258  // simd versions (e.g. dc-only) so that we get full asm coverage in this
259  // test
260 
261  int n;
262  const int16_t *scan = ff_vp9_scans[tx][txtp];
263  int eob;
264 
265  for (n = 0; n < sz * sz; n++) {
266  int rc = scan[n], rcx = rc % sz, rcy = rc / sz;
267 
268  // find eob for this sub-idct
269  if (rcx >= sub || rcy >= sub)
270  break;
271 
272  // copy coef
273  if (bit_depth == 8) {
274  out[rc] = in[rc];
275  } else {
276  AV_COPY32(&out[rc * 2], &in[rc * 2]);
277  }
278  }
279 
280  eob = n;
281 
282  for (; n < sz * sz; n++) {
283  int rc = scan[n];
284 
285  // zero
286  if (bit_depth == 8) {
287  out[rc] = 0;
288  } else {
289  AV_ZERO32(&out[rc * 2]);
290  }
291  }
292 
293  return eob;
294 }
295 
/* Return 1 when the first `sz` bytes of `c` are all zero, 0 otherwise.
 * The buffer is examined 32 bits (two int16_t) at a time. */
static int iszero(const int16_t *c, int sz)
{
    int n = 0;

    while (n < sz / sizeof(int16_t)) {
        uint32_t word;

        memcpy(&word, &c[n], sizeof(word));
        if (word != 0)
            return 0;
        n += 2;
    }

    return 1;
}
306 
307 #define SIZEOF_COEF (2 * ((bit_depth + 7) / 8))
308 
309 static void check_itxfm(void)
310 {
311  LOCAL_ALIGNED_32(uint8_t, src, [32 * 32 * 2]);
312  LOCAL_ALIGNED_32(uint8_t, dst, [32 * 32 * 2]);
313  LOCAL_ALIGNED_32(uint8_t, dst0, [32 * 32 * 2]);
314  LOCAL_ALIGNED_32(uint8_t, dst1, [32 * 32 * 2]);
315  LOCAL_ALIGNED_32(int16_t, coef, [32 * 32 * 2]);
316  LOCAL_ALIGNED_32(int16_t, subcoef0, [32 * 32 * 2]);
317  LOCAL_ALIGNED_32(int16_t, subcoef1, [32 * 32 * 2]);
318  declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
319  VP9DSPContext dsp;
320  int y, x, tx, txtp, bit_depth, sub;
321  static const char *const txtp_types[N_TXFM_TYPES] = {
322  [DCT_DCT] = "dct_dct", [DCT_ADST] = "adst_dct",
323  [ADST_DCT] = "dct_adst", [ADST_ADST] = "adst_adst"
324  };
325 
326  for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
327  ff_vp9dsp_init(&dsp, bit_depth, 0);
328 
329  for (tx = TX_4X4; tx <= N_TXFM_SIZES /* 4 = lossless */; tx++) {
330  int sz = 4 << (tx & 3);
331  int n_txtps = tx < TX_32X32 ? N_TXFM_TYPES : 1;
332 
333  for (txtp = 0; txtp < n_txtps; txtp++) {
334  // skip testing sub-IDCTs for WHT or ADST since they don't
335  // implement it in any of the SIMD functions. If they do,
336  // consider changing this to ensure we have complete test
337  // coverage. Test sub=1 for dc-only, then 2, 4, 8, 12, etc,
338  // since the arm version can distinguish them at that level.
339  for (sub = (txtp == 0 && tx < 4) ? 1 : sz; sub <= sz;
340  sub < 4 ? (sub <<= 1) : (sub += 4)) {
341  if (check_func(dsp.itxfm_add[tx][txtp],
342  "vp9_inv_%s_%dx%d_sub%d_add_%d",
343  tx == 4 ? "wht_wht" : txtp_types[txtp],
344  sz, sz, sub, bit_depth)) {
345  int eob;
346 
348  ftx(coef, tx, txtp, sz, bit_depth);
349 
350  if (sub < sz) {
351  eob = copy_subcoefs(subcoef0, coef, tx, txtp,
352  sz, sub, bit_depth);
353  } else {
354  eob = sz * sz;
355  memcpy(subcoef0, coef, sz * sz * SIZEOF_COEF);
356  }
357 
358  memcpy(dst0, dst, sz * sz * SIZEOF_PIXEL);
359  memcpy(dst1, dst, sz * sz * SIZEOF_PIXEL);
360  memcpy(subcoef1, subcoef0, sz * sz * SIZEOF_COEF);
361  call_ref(dst0, sz * SIZEOF_PIXEL, subcoef0, eob);
362  call_new(dst1, sz * SIZEOF_PIXEL, subcoef1, eob);
363  if (memcmp(dst0, dst1, sz * sz * SIZEOF_PIXEL) ||
364  !iszero(subcoef0, sz * sz * SIZEOF_COEF) ||
365  !iszero(subcoef1, sz * sz * SIZEOF_COEF))
366  fail();
367 
368  bench_new(dst, sz * SIZEOF_PIXEL, coef, eob);
369  }
370  }
371  }
372  }
373  }
374  report("itxfm");
375 }
376 
#undef randomize_buffers

/* Store `val`, clipped to the pixel range of the current bit depth, at
 * element (pos) + (line) * jstride of buf0. Uses `buf0`, `jstride` and
 * `bit_depth` from the expanding scope. */
#define setpx(pos, line, val)                                               \
    do {                                                                    \
        if (SIZEOF_PIXEL != 1) {                                            \
            ((uint16_t *)buf0)[(pos) + (line) * jstride] =                  \
                av_clip_uintp2(val, bit_depth);                             \
        } else {                                                            \
            buf0[(pos) + (line) * jstride] = av_clip_uint8(val);            \
        }                                                                   \
    } while (0)

// val can be an assignment and must not be put under ()
/* setdx: like setpx, but the stored value is `val` plus a random offset in
 * [-dev, +dev]. */
#define setdx(pos, line, val, dev) \
    setpx(pos, line, val-(dev)+(rnd()%((dev)*2+1)))
/* setsx: like setdx, with the deviation scaled up by (bit_depth - 8) bits. */
#define setsx(pos, line, val, dev) \
    setdx(pos, line, val, (dev) << (bit_depth - 8))
/* Fill eight consecutive lanes of a loop-filter test buffer so that each
 * pair of lanes has a different amount of variation across the edge:
 *   lanes 0-1 (flat16):  all 8 samples on each side stay within F of p0/q0
 *   lanes 2-3 (flat8):   the inner samples stay within F, the outer 4 on each
 *                        side are fully random
 *   lanes 4-5 (regular): neighbouring samples differ by at most I, the outer
 *                        4 on each side are fully random
 *   lanes 6-7 (off):     everything is fully random
 * In the first three groups p0 is placed within E >> 2 of q0. All deviations
 * are scaled by the bit depth (see setsx).
 *
 * bidx selects the entry of E/F/I to use, lineoff is the first lane, str the
 * line stride used when dir is non-zero. H and buf1 are not used here. */
static void randomize_loopfilter_buffers(int bidx, int lineoff, int str,
                                         int bit_depth, int dir, const int *E,
                                         const int *F, const int *H, const int *I,
                                         uint8_t *buf0, uint8_t *buf1)
{
    const uint32_t px_max = (1 << bit_depth) - 1;
    const int first_lane  = dir ? lineoff : lineoff * 16;
    const int lane_step   = dir ? 1 : 16;
    const int jstride     = dir ? str : 1; /* referenced by setpx */
    int lane, tap;

    /* flat16 */
    for (lane = 0; lane < 2; lane++) {
        const int idx = first_lane + lane * lane_step;
        int p0, q0;

        setpx(idx, 0, q0 = rnd() & px_max);
        setsx(idx, -1, p0 = q0, E[bidx] >> 2);
        for (tap = 1; tap < 8; tap++) {
            setsx(idx, -1 - tap, p0, F[bidx]);
            setsx(idx, tap, q0, F[bidx]);
        }
    }

    /* flat8 */
    for (lane = 2; lane < 4; lane++) {
        const int idx = first_lane + lane * lane_step;
        int p0, q0;

        setpx(idx, 0, q0 = rnd() & px_max);
        setsx(idx, -1, p0 = q0, E[bidx] >> 2);
        for (tap = 1; tap < 4; tap++) {
            setsx(idx, -1 - tap, p0, F[bidx]);
            setsx(idx, tap, q0, F[bidx]);
        }
        for (tap = 4; tap < 8; tap++) {
            setpx(idx, -1 - tap, rnd() & px_max);
            setpx(idx, tap, rnd() & px_max);
        }
    }

    /* regular */
    for (lane = 4; lane < 6; lane++) {
        const int idx = first_lane + lane * lane_step;
        int p2, p1, p0, q0, q1, q2;

        setpx(idx, 0, q0 = rnd() & px_max);
        setsx(idx, 1, q1 = q0, I[bidx]);
        setsx(idx, 2, q2 = q1, I[bidx]);
        setsx(idx, 3, q2, I[bidx]);
        setsx(idx, -1, p0 = q0, E[bidx] >> 2);
        setsx(idx, -2, p1 = p0, I[bidx]);
        setsx(idx, -3, p2 = p1, I[bidx]);
        setsx(idx, -4, p2, I[bidx]);
        for (tap = 4; tap < 8; tap++) {
            setpx(idx, -1 - tap, rnd() & px_max);
            setpx(idx, tap, rnd() & px_max);
        }
    }

    /* off */
    for (lane = 6; lane < 8; lane++) {
        const int idx = first_lane + lane * lane_step;

        for (tap = 0; tap < 8; tap++) {
            setpx(idx, -1 - tap, rnd() & px_max);
            setpx(idx, tap, rnd() & px_max);
        }
    }
}

/* Convenience wrapper picking up bit_depth, dir, the threshold tables and the
 * buffers from the expanding scope. */
#define randomize_buffers(bidx, lineoff, str) \
    randomize_loopfilter_buffers(bidx, lineoff, str, bit_depth, dir, \
                                 E, F, H, I, buf0, buf1)
449 
450 static void check_loopfilter(void)
451 {
452  LOCAL_ALIGNED_32(uint8_t, base0, [32 + 16 * 16 * 2]);
453  LOCAL_ALIGNED_32(uint8_t, base1, [32 + 16 * 16 * 2]);
454  VP9DSPContext dsp;
455  int dir, wd, wd2, bit_depth;
456  static const char *const dir_name[2] = { "h", "v" };
457  static const int E[2] = { 20, 28 }, I[2] = { 10, 16 };
458  static const int H[2] = { 7, 11 }, F[2] = { 1, 1 };
459  declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
460 
461  for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
462  ff_vp9dsp_init(&dsp, bit_depth, 0);
463 
464  for (dir = 0; dir < 2; dir++) {
465  int midoff = (dir ? 8 * 8 : 8) * SIZEOF_PIXEL;
466  int midoff_aligned = (dir ? 8 * 8 : 16) * SIZEOF_PIXEL;
467  uint8_t *buf0 = base0 + midoff_aligned;
468  uint8_t *buf1 = base1 + midoff_aligned;
469 
470  for (wd = 0; wd < 3; wd++) {
471  // 4/8/16wd_8px
472  if (check_func(dsp.loop_filter_8[wd][dir],
473  "vp9_loop_filter_%s_%d_8_%dbpp",
474  dir_name[dir], 4 << wd, bit_depth)) {
475  randomize_buffers(0, 0, 8);
476  memcpy(buf1 - midoff, buf0 - midoff,
477  16 * 8 * SIZEOF_PIXEL);
478  call_ref(buf0, 16 * SIZEOF_PIXEL >> dir, E[0], I[0], H[0]);
479  call_new(buf1, 16 * SIZEOF_PIXEL >> dir, E[0], I[0], H[0]);
480  if (memcmp(buf0 - midoff, buf1 - midoff, 16 * 8 * SIZEOF_PIXEL))
481  fail();
482  bench_new(buf1, 16 * SIZEOF_PIXEL >> dir, E[0], I[0], H[0]);
483  }
484  }
485 
486  midoff = (dir ? 16 * 8 : 8) * SIZEOF_PIXEL;
487  midoff_aligned = (dir ? 16 * 8 : 16) * SIZEOF_PIXEL;
488 
489  buf0 = base0 + midoff_aligned;
490  buf1 = base1 + midoff_aligned;
491 
492  // 16wd_16px loopfilter
493  if (check_func(dsp.loop_filter_16[dir],
494  "vp9_loop_filter_%s_16_16_%dbpp",
495  dir_name[dir], bit_depth)) {
496  randomize_buffers(0, 0, 16);
497  randomize_buffers(0, 8, 16);
498  memcpy(buf1 - midoff, buf0 - midoff, 16 * 16 * SIZEOF_PIXEL);
499  call_ref(buf0, 16 * SIZEOF_PIXEL, E[0], I[0], H[0]);
500  call_new(buf1, 16 * SIZEOF_PIXEL, E[0], I[0], H[0]);
501  if (memcmp(buf0 - midoff, buf1 - midoff, 16 * 16 * SIZEOF_PIXEL))
502  fail();
503  bench_new(buf1, 16 * SIZEOF_PIXEL, E[0], I[0], H[0]);
504  }
505 
506  for (wd = 0; wd < 2; wd++) {
507  for (wd2 = 0; wd2 < 2; wd2++) {
508  // mix2 loopfilter
509  if (check_func(dsp.loop_filter_mix2[wd][wd2][dir],
510  "vp9_loop_filter_mix2_%s_%d%d_16_%dbpp",
511  dir_name[dir], 4 << wd, 4 << wd2, bit_depth)) {
512  randomize_buffers(0, 0, 16);
513  randomize_buffers(1, 8, 16);
514  memcpy(buf1 - midoff, buf0 - midoff, 16 * 16 * SIZEOF_PIXEL);
515 #define M(a) (((a)[1] << 8) | (a)[0])
516  call_ref(buf0, 16 * SIZEOF_PIXEL, M(E), M(I), M(H));
517  call_new(buf1, 16 * SIZEOF_PIXEL, M(E), M(I), M(H));
518  if (memcmp(buf0 - midoff, buf1 - midoff, 16 * 16 * SIZEOF_PIXEL))
519  fail();
520  bench_new(buf1, 16 * SIZEOF_PIXEL, M(E), M(I), M(H));
521 #undef M
522  }
523  }
524  }
525  }
526  }
527  report("loopfilter");
528 }
529 
#undef randomize_buffers
#undef setdx
#undef setpx
#undef setsx

/* Geometry of the motion-compensation test buffers. `size`, `bit_depth` and
 * `buf` are taken from the scope in which the macros are expanded. */

/* line pitch of the source buffer, in pixels */
#define SRC_BUF_STRIDE 72
/* bytes of source data to randomize: size + 7 lines */
#define SRC_BUF_SIZE ((size + 7) * SRC_BUF_STRIDE * SIZEOF_PIXEL)
/* block origin inside buf: 3 lines down and 3 pixels in */
#define src (buf + 3 * SIZEOF_PIXEL * (SRC_BUF_STRIDE + 1))
/* bytes of one size x size destination block */
#define DST_BUF_SIZE (size * size * SIZEOF_PIXEL)
539 
/* Fill the source buffer with random in-range pixels. When `op` is 1 both
 * destination buffers are additionally filled with the same random pixels.
 * Uses `buf`, `dst0`, `dst1`, `op`, `size` and `bit_depth` from the
 * expanding scope. */
#define randomize_buffers()                                                 \
    do {                                                                    \
        const uint32_t px_mask = pixel_mask[(bit_depth - 8) >> 1];          \
        int pos;                                                            \
        for (pos = 0; pos < SRC_BUF_SIZE; pos += 4)                         \
            AV_WN32A(buf + pos, rnd() & px_mask);                           \
        if (op == 1) {                                                      \
            for (pos = 0; pos < DST_BUF_SIZE; pos += 4) {                   \
                const uint32_t px = rnd() & px_mask;                        \
                AV_WN32A(dst0 + pos, px);                                   \
                AV_WN32A(dst1 + pos, px);                                   \
            }                                                               \
        }                                                                   \
    } while (0)
556 
557 static void check_mc(void)
558 {
559  LOCAL_ALIGNED_32(uint8_t, buf, [72 * 72 * 2]);
560  LOCAL_ALIGNED_32(uint8_t, dst0, [64 * 64 * 2]);
561  LOCAL_ALIGNED_32(uint8_t, dst1, [64 * 64 * 2]);
562  VP9DSPContext dsp;
563  int op, hsize, bit_depth, filter, dx, dy;
564  declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t dst_stride,
565  const uint8_t *ref, ptrdiff_t ref_stride,
566  int h, int mx, int my);
567  static const char *const filter_names[4] = {
568  "8tap_smooth", "8tap_regular", "8tap_sharp", "bilin"
569  };
570  static const char *const subpel_names[2][2] = { { "", "h" }, { "v", "hv" } };
571  static const char *const op_names[2] = { "put", "avg" };
572  char str[256];
573 
574  for (op = 0; op < 2; op++) {
575  for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
576  ff_vp9dsp_init(&dsp, bit_depth, 0);
577  for (hsize = 0; hsize < 5; hsize++) {
578  int size = 64 >> hsize;
579 
580  for (filter = 0; filter < 4; filter++) {
581  for (dx = 0; dx < 2; dx++) {
582  for (dy = 0; dy < 2; dy++) {
583  if (dx || dy) {
584  snprintf(str, sizeof(str),
585  "%s_%s_%d%s", op_names[op],
586  filter_names[filter], size,
587  subpel_names[dy][dx]);
588  } else {
589  snprintf(str, sizeof(str),
590  "%s%d", op_names[op], size);
591  }
592  if (check_func(dsp.mc[hsize][filter][op][dx][dy],
593  "vp9_%s_%dbpp", str, bit_depth)) {
594  int mx = dx ? 1 + (rnd() % 14) : 0;
595  int my = dy ? 1 + (rnd() % 14) : 0;
597  call_ref(dst0, size * SIZEOF_PIXEL,
598  src, SRC_BUF_STRIDE * SIZEOF_PIXEL,
599  size, mx, my);
600  call_new(dst1, size * SIZEOF_PIXEL,
601  src, SRC_BUF_STRIDE * SIZEOF_PIXEL,
602  size, mx, my);
603  if (memcmp(dst0, dst1, DST_BUF_SIZE))
604  fail();
605 
606  // simd implementations for each filter of subpel
607  // functions are identical
608  if (filter >= 1 && filter <= 2) continue;
609  // 10/12 bpp for bilin are identical
610  if (bit_depth == 12 && filter == 3) continue;
611 
612  bench_new(dst1, size * SIZEOF_PIXEL,
613  src, SRC_BUF_STRIDE * SIZEOF_PIXEL,
614  size, mx, my);
615  }
616  }
617  }
618  }
619  }
620  }
621  }
622  report("mc");
623 }
624 
/* checkasm entry point for the VP9 DSP functions: runs every test group
 * defined in this file. */
void checkasm_check_vp9dsp(void)
{
    check_ipred();
    check_itxfm();
    check_loopfilter();
    check_mc();
}
vp9_mc_func mc[5][N_FILTERS][2][2][2]
Definition: vp9dsp.h:114
Definition: vp9.h:47
#define SRC_BUF_STRIDE
Definition: vp9dsp.c:536
static void check_loopfilter(void)
Definition: vp9dsp.c:450
else temp
Definition: vf_mcdeint.c:256
#define M_SQRT1_2
Definition: mathematics.h:58
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:36
static int copy_subcoefs(int16_t *out, const int16_t *in, enum TxfmMode tx, enum TxfmType txtp, int sz, int sub, int bit_depth)
Definition: vp9dsp.c:251
void(* intra_pred[N_TXFM_SIZES][N_INTRA_PRED_MODES])(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *top)
Definition: vp9dsp.h:51
static const uint8_t q1[256]
Definition: twofish.c:96
#define setpx(a, b, c)
Definition: vp9dsp.c:379
#define AV_COPY32(d, s)
Definition: intreadwrite.h:601
#define setsx(a, b, c, d)
Definition: vp9dsp.c:390
#define report
Definition: checkasm.h:123
#define SIZEOF_COEF
Definition: vp9dsp.c:307
#define AV_RN32A(p)
Definition: intreadwrite.h:526
#define src
Definition: vp9dsp.c:538
The exact code depends on how similar the blocks are and how related they are to the block
uint8_t
static void randomize_loopfilter_buffers(int bidx, int lineoff, int str, int bit_depth, int dir, const int *E, const int *F, const int *H, const int *I, uint8_t *buf0, uint8_t *buf1)
Definition: vp9dsp.c:391
TxfmType
Definition: vp9.h:37
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
static void fadst4_1d(double *out, const double *in, int sz)
Definition: vp9dsp.c:152
Definition: vp9.h:46
#define AV_CPU_FLAG_MMXEXT
SSE integer functions or AMD MMX ext.
Definition: cpu.h:32
#define t0
Definition: regdef.h:28
static void ftx(int16_t *buf, enum TxfmMode tx, enum TxfmType txtp, int sz, int bit_depth)
Definition: vp9dsp.c:229
#define DST_BUF_SIZE
Definition: vp9dsp.c:535
const int16_t *const ff_vp9_scans[5][4]
Definition: vp9data.c:600
ptrdiff_t size
Definition: opengl_enc.c:100
Definition: vp9.h:38
static void fwht_1d(double *out, const double *in, int sz)
Definition: vp9dsp.c:122
av_cold void ff_vp9dsp_init(VP9DSPContext *dsp, int bpp, int bitexact)
Definition: vp9dsp.c:84
Definition: vp9.h:28
static int iszero(const int16_t *c, int sz)
Definition: vp9dsp.c:296
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:259
filter_frame For filters that do not use the this method is called when a frame is pushed to the filter s input It can be called at any time except in a reentrant way If the input frame is enough to produce then the filter should push the output frames on the output link immediately As an exception to the previous rule if the input frame is enough to produce several output frames then the filter needs output only at least one per link The additional frames can be left buffered in the filter
static const uint16_t mask[17]
Definition: lzw.c:38
Definition: vp9.h:39
Definition: vp9.h:41
#define t1
Definition: regdef.h:29
static void fadst_1d(double *out, const double *in, int sz)
Definition: vp9dsp.c:166
TxfmMode
Definition: vp9.h:27
#define t3
Definition: regdef.h:31
#define fail()
Definition: checkasm.h:120
static const uint8_t q0[256]
Definition: twofish.c:77
common internal API header
#define E
Definition: avdct.c:32
typedef void(APIENTRY *FF_PFNGLACTIVETEXTUREPROC)(GLenum texture)
static av_always_inline av_const double trunc(double x)
Definition: libm.h:458
static void bit_depth(AudioStatsContext *s, uint64_t mask, uint64_t imask, AVRational *depth)
Definition: af_astats.c:226
void(* loop_filter_16[2])(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr)
Definition: vp9dsp.h:88
int32_t
void(* ftx1d_fn)(double *out, const double *in, int sz)
Definition: vp9dsp.c:177
int n
Definition: avisynth_c.h:760
#define SIZEOF_PIXEL
Definition: vp9dsp.c:32
#define declare_func_emms(cpu_flags, ret,...)
Definition: checkasm.h:117
#define call_ref(...)
Definition: checkasm.h:126
static void check_mc(void)
Definition: vp9dsp.c:557
void(* loop_filter_mix2[2][2][2])(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr)
Definition: vp9dsp.h:102
void(* loop_filter_8[3][2])(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr)
Definition: vp9dsp.h:80
#define AV_CPU_FLAG_MMX
standard MMX
Definition: cpu.h:31
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2]...the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so...,+,-,+,-,+,+,-,+,-,+,...hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32-hcoeff[1]-hcoeff[2]-...a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2}an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||.........intra?||||:Block01:yes no||||:Block02:.................||||:Block03::y DC::ref index:||||:Block04::cb DC::motion x:||||.........:cr DC::motion 
y:||||.................|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------------------------------|||Y subbands||Cb subbands||Cr subbands||||------||------||------|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||------||------||------||||------||------||------|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||------||------||------||||------||------||------|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||------||------||------||||------||------||------|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------------------------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction------------|\Dequantization-------------------\||Reference frames|\IDWT|--------------|Motion\|||Frame 0||Frame 1||Compensation.OBMC v-------|--------------|--------------.\------> Frame n output Frame Frame<----------------------------------/|...|-------------------Range Coder:============Binary Range Coder:-------------------The implemented range coder is an adapted version based upon"Range encoding: an algorithm for removing redundancy from a digitised message."by G.N.N.Martin.The symbols encoded by the Snow range coder are bits(0|1).The associated probabilities are not fix but change depending on the symbol mix seen so far.bit seen|new state---------+-----------------------------------------------0|256-state_transition_table[256-old_state];1|state_transition_table[old_state];state_transition_table={0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 
106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:-------------------------FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector Prediction:=========================1.the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled left
Definition: snow.txt:206
void * buf
Definition: avisynth_c.h:766
static const uint32_t pixel_mask[3]
Definition: vp9dsp.c:31
uint8_t pi<< 24) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi-0x80)*(1.0f/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi-0x80)*(1.0/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S16, int16_t,(*(const int16_t *) pi >> 8)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S16, int16_t,*(const int16_t *) pi *(1.0f/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S16, int16_t,*(const int16_t *) pi *(1.0/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S32, int32_t,(*(const int32_t *) pi >> 24)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S32, int32_t,*(const int32_t *) pi *(1.0f/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S32, int32_t,*(const int32_t *) pi *(1.0/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_FLT, float, av_clip_uint8(lrintf(*(const float *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_FLT, float, av_clip_int16(lrintf(*(const float *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_FLT, float, av_clipl_int32(llrintf(*(const float *) pi *(1U<< 31)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_DBL, double, av_clip_uint8(lrint(*(const double *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_DBL, double, av_clip_int16(lrint(*(const double *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_DBL, double, av_clipl_int32(llrint(*(const double *) pi *(1U<< 31))))#define SET_CONV_FUNC_GROUP(ofmt, ifmt) static void set_generic_function(AudioConvert *ac){}void ff_audio_convert_free(AudioConvert **ac){if(!*ac) return;ff_dither_free(&(*ac) ->dc);av_freep(ac);}AudioConvert *ff_audio_convert_alloc(AVAudioResampleContext *avr, enum AVSampleFormat out_fmt, enum AVSampleFormat in_fmt, int channels, int sample_rate, 
int apply_map){AudioConvert *ac;int in_planar, out_planar;ac=av_mallocz(sizeof(*ac));if(!ac) return NULL;ac->avr=avr;ac->out_fmt=out_fmt;ac->in_fmt=in_fmt;ac->channels=channels;ac->apply_map=apply_map;if(avr->dither_method!=AV_RESAMPLE_DITHER_NONE &&av_get_packed_sample_fmt(out_fmt)==AV_SAMPLE_FMT_S16 &&av_get_bytes_per_sample(in_fmt) > 2){ac->dc=ff_dither_alloc(avr, out_fmt, in_fmt, channels, sample_rate, apply_map);if(!ac->dc){av_free(ac);return NULL;}return ac;}in_planar=ff_sample_fmt_is_planar(in_fmt, channels);out_planar=ff_sample_fmt_is_planar(out_fmt, channels);if(in_planar==out_planar){ac->func_type=CONV_FUNC_TYPE_FLAT;ac->planes=in_planar?ac->channels:1;}else if(in_planar) ac->func_type=CONV_FUNC_TYPE_INTERLEAVE;else ac->func_type=CONV_FUNC_TYPE_DEINTERLEAVE;set_generic_function(ac);if(ARCH_AARCH64) ff_audio_convert_init_aarch64(ac);if(ARCH_ARM) ff_audio_convert_init_arm(ac);if(ARCH_X86) ff_audio_convert_init_x86(ac);return ac;}int ff_audio_convert(AudioConvert *ac, AudioData *out, AudioData *in){int use_generic=1;int len=in->nb_samples;int p;if(ac->dc){av_log(ac->avr, AV_LOG_TRACE,"%d samples - audio_convert: %s to %s (dithered)\n", len, av_get_sample_fmt_name(ac->in_fmt), av_get_sample_fmt_name(ac->out_fmt));return ff_convert_dither(ac-> in
#define check_func(func,...)
Definition: checkasm.h:111
static void ftx_2d(double *out, const double *in, enum TxfmMode tx, enum TxfmType txtp, int sz)
Definition: vp9dsp.c:178
#define snprintf
Definition: snprintf.h:34
#define M(a)
Definition: vp9.h:48
#define LOCAL_ALIGNED_32(t, v,...)
Definition: internal.h:137
#define randomize_buffers()
Definition: vp9dsp.c:540
static int op(uint8_t **dst, const uint8_t *dst_end, GetByteContext *gb, int pixel, int count, int *x, int width, int linesize)
Perform decode operation.
Definition: anm.c:78
GLint GLenum GLboolean GLsizei stride
Definition: opengl_enc.c:104
void(* itxfm_add[N_TXFM_SIZES+1][N_TXFM_TYPES])(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob)
Definition: vp9dsp.h:70
static void fdct_1d(double *out, const double *in, int sz)
Definition: vp9dsp.c:137
common internal and external API header
static int ref[MAX_W *MAX_W]
Definition: jpeg2000dwt.c:107
#define rnd()
Definition: checkasm.h:104
void checkasm_check_vp9dsp(void)
Definition: vp9dsp.c:625
static void check_ipred(void)
Definition: vp9dsp.c:48
#define t4
Definition: regdef.h:32
static void check_itxfm(void)
Definition: vp9dsp.c:309
#define F(x)
#define bench_new(...)
Definition: checkasm.h:253
#define H
Definition: pixlet.c:39
#define AV_ZERO32(d)
Definition: intreadwrite.h:629
#define lrint
Definition: tablegen.h:53
Definition: vp9.h:31
FILE * out
Definition: movenc.c:54
#define call_new(...)
Definition: checkasm.h:193
#define M_PI
Definition: mathematics.h:52
Definition: vp9.h:40
mode
Use these values in ebur128_init (or&#39;ed).
Definition: ebur128.h:83
#define t2
Definition: regdef.h:30