FFmpeg
vp9dsp.c
Go to the documentation of this file.
/*
 * Copyright (c) 2015 Ronald S. Bultje <rsbultje@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
20 
21 #include <math.h>
22 #include <string.h>
23 #include "checkasm.h"
24 #include "libavcodec/vp9data.h"
25 #include "libavcodec/vp9.h"
26 #include "libavutil/common.h"
27 #include "libavutil/internal.h"
28 #include "libavutil/intreadwrite.h"
29 #include "libavutil/mathematics.h"
30 #include "libavutil/mem_internal.h"
31 
/* Masks applied to each random 32-bit word so that the packed samples stay
 * within range; indexed by (bit_depth - 8) >> 1, i.e. one entry each for
 * 8, 10 and 12 bits per sample. */
static const uint32_t pixel_mask[3] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff };
/* Bytes per pixel; reads a variable named bit_depth at the point of use. */
#define SIZEOF_PIXEL ((bit_depth + 7) / 8)
34 
/* Fill the intra-prediction edge buffers with random, depth-masked samples.
 * Expands in a scope that provides `a`, `l`, `size` and `bit_depth`.
 * `a` is written starting 4 bytes before a[0] and always covers at least
 * 8 pixels; `l` covers `size` pixels. */
#define randomize_buffers()                                                   \
    do {                                                                      \
        uint32_t mask = pixel_mask[(bit_depth - 8) >> 1];                     \
        int k;                                                                \
        for (k = -4; k < SIZEOF_PIXEL * FFMAX(8, size); k += 4) {             \
            uint32_t r = rnd() & mask;                                        \
            AV_WN32A(a + k, r);                                               \
        }                                                                     \
        for (k = 0; k < size * SIZEOF_PIXEL; k += 4) {                        \
            uint32_t r = rnd() & mask;                                        \
            AV_WN32A(l + k, r);                                               \
        }                                                                     \
    } while (0)
48 
49 static void check_ipred(void)
50 {
51  LOCAL_ALIGNED_32(uint8_t, a_buf, [64 * 2]);
52  uint8_t *a = &a_buf[32 * 2];
53  LOCAL_ALIGNED_32(uint8_t, l, [32 * 2]);
54  LOCAL_ALIGNED_32(uint8_t, dst0, [32 * 32 * 2]);
55  LOCAL_ALIGNED_32(uint8_t, dst1, [32 * 32 * 2]);
56  VP9DSPContext dsp;
57  int tx, mode, bit_depth;
58  declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t stride,
59  const uint8_t *left, const uint8_t *top);
60  static const char *const mode_names[N_INTRA_PRED_MODES] = {
61  [VERT_PRED] = "vert",
62  [HOR_PRED] = "hor",
63  [DC_PRED] = "dc",
64  [DIAG_DOWN_LEFT_PRED] = "diag_downleft",
65  [DIAG_DOWN_RIGHT_PRED] = "diag_downright",
66  [VERT_RIGHT_PRED] = "vert_right",
67  [HOR_DOWN_PRED] = "hor_down",
68  [VERT_LEFT_PRED] = "vert_left",
69  [HOR_UP_PRED] = "hor_up",
70  [TM_VP8_PRED] = "tm",
71  [LEFT_DC_PRED] = "dc_left",
72  [TOP_DC_PRED] = "dc_top",
73  [DC_128_PRED] = "dc_128",
74  [DC_127_PRED] = "dc_127",
75  [DC_129_PRED] = "dc_129",
76  };
77 
78  for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
79  ff_vp9dsp_init(&dsp, bit_depth, 0);
80  for (tx = 0; tx < 4; tx++) {
81  int size = 4 << tx;
82 
83  for (mode = 0; mode < N_INTRA_PRED_MODES; mode++) {
84  if (check_func(dsp.intra_pred[tx][mode], "vp9_%s_%dx%d_%dbpp",
85  mode_names[mode], size, size, bit_depth)) {
87  call_ref(dst0, size * SIZEOF_PIXEL, l, a);
88  call_new(dst1, size * SIZEOF_PIXEL, l, a);
89  if (memcmp(dst0, dst1, size * size * SIZEOF_PIXEL))
90  fail();
91  bench_new(dst1, size * SIZEOF_PIXEL,l, a);
92  }
93  }
94  }
95  }
96  report("ipred");
97 }
98 
#undef randomize_buffers

/* Fill `src` and `dst` (sz x sz pixels each) with random, depth-masked
 * samples and store the per-pixel difference src - dst in `coef`.
 * Expands in a scope that provides `src`, `dst`, `coef`, `sz`, `x`, `y`
 * and `bit_depth`. For 8-bit content the residual is stored as int16_t,
 * otherwise as int32_t through a cast of `coef`. */
#define randomize_buffers()                                                   \
    do {                                                                      \
        uint32_t mask = pixel_mask[(bit_depth - 8) >> 1];                     \
        for (y = 0; y < sz; y++) {                                            \
            for (x = 0; x < sz * SIZEOF_PIXEL; x += 4) {                      \
                uint32_t r = rnd() & mask;                                    \
                AV_WN32A(dst + y * sz * SIZEOF_PIXEL + x, r);                 \
                AV_WN32A(src + y * sz * SIZEOF_PIXEL + x, rnd() & mask);      \
            }                                                                 \
            for (x = 0; x < sz; x++) {                                        \
                if (bit_depth == 8) {                                         \
                    coef[y * sz + x] = src[y * sz + x] - dst[y * sz + x];     \
                } else {                                                      \
                    ((int32_t *) coef)[y * sz + x] =                          \
                        ((uint16_t *) src)[y * sz + x] -                      \
                        ((uint16_t *) dst)[y * sz + x];                       \
                }                                                             \
            }                                                                 \
        }                                                                     \
    } while(0)
121 
122 // wht function copied from libvpx
123 static void fwht_1d(double *out, const double *in, int sz)
124 {
125  double t0 = in[0] + in[1];
126  double t3 = in[3] - in[2];
127  double t4 = trunc((t0 - t3) * 0.5);
128  double t1 = t4 - in[1];
129  double t2 = t4 - in[2];
130 
131  out[0] = t0 - t2;
132  out[1] = t2;
133  out[2] = t3 + t1;
134  out[3] = t1;
135 }
136 
137 // standard DCT-II
138 static void fdct_1d(double *out, const double *in, int sz)
139 {
140  int k, n;
141 
142  for (k = 0; k < sz; k++) {
143  out[k] = 0.0;
144  for (n = 0; n < sz; n++)
145  out[k] += in[n] * cos(M_PI * (2 * n + 1) * k / (sz * 2.0));
146  }
147  out[0] *= M_SQRT1_2;
148 }
149 
150 // see "Towards jointly optimal spatial prediction and adaptive transform in
151 // video/image coding", by J. Han, A. Saxena, and K. Rose
152 // IEEE Proc. ICASSP, pp. 726-729, Mar. 2010.
153 static void fadst4_1d(double *out, const double *in, int sz)
154 {
155  int k, n;
156 
157  for (k = 0; k < sz; k++) {
158  out[k] = 0.0;
159  for (n = 0; n < sz; n++)
160  out[k] += in[n] * sin(M_PI * (n + 1) * (2 * k + 1) / (sz * 2.0 + 1.0));
161  }
162 }
163 
164 // see "A Butterfly Structured Design of The Hybrid Transform Coding Scheme",
165 // by Jingning Han, Yaowu Xu, and Debargha Mukherjee
166 // http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/41418.pdf
167 static void fadst_1d(double *out, const double *in, int sz)
168 {
169  int k, n;
170 
171  for (k = 0; k < sz; k++) {
172  out[k] = 0.0;
173  for (n = 0; n < sz; n++)
174  out[k] += in[n] * sin(M_PI * (2 * n + 1) * (2 * k + 1) / (sz * 4.0));
175  }
176 }
177 
178 typedef void (*ftx1d_fn)(double *out, const double *in, int sz);
179 static void ftx_2d(double *out, const double *in, enum TxfmMode tx,
180  enum TxfmType txtp, int sz)
181 {
182  static const double scaling_factors[5][4] = {
183  { 4.0, 16.0 * M_SQRT1_2 / 3.0, 16.0 * M_SQRT1_2 / 3.0, 32.0 / 9.0 },
184  { 2.0, 2.0, 2.0, 2.0 },
185  { 1.0, 1.0, 1.0, 1.0 },
186  { 0.25 },
187  { 4.0 }
188  };
189  static const ftx1d_fn ftx1d_tbl[5][4][2] = {
190  {
191  { fdct_1d, fdct_1d },
192  { fadst4_1d, fdct_1d },
193  { fdct_1d, fadst4_1d },
194  { fadst4_1d, fadst4_1d },
195  }, {
196  { fdct_1d, fdct_1d },
197  { fadst_1d, fdct_1d },
198  { fdct_1d, fadst_1d },
199  { fadst_1d, fadst_1d },
200  }, {
201  { fdct_1d, fdct_1d },
202  { fadst_1d, fdct_1d },
203  { fdct_1d, fadst_1d },
204  { fadst_1d, fadst_1d },
205  }, {
206  { fdct_1d, fdct_1d },
207  }, {
208  { fwht_1d, fwht_1d },
209  },
210  };
211  double temp[1024];
212  double scaling_factor = scaling_factors[tx][txtp];
213  int i, j;
214 
215  // cols
216  for (i = 0; i < sz; ++i) {
217  double temp_out[32];
218 
219  ftx1d_tbl[tx][txtp][0](temp_out, &in[i * sz], sz);
220  // scale and transpose
221  for (j = 0; j < sz; ++j)
222  temp[j * sz + i] = temp_out[j] * scaling_factor;
223  }
224 
225  // rows
226  for (i = 0; i < sz; i++)
227  ftx1d_tbl[tx][txtp][1](&out[i * sz], &temp[i * sz], sz);
228 }
229 
230 static void ftx(int16_t *buf, enum TxfmMode tx,
231  enum TxfmType txtp, int sz, int bit_depth)
232 {
233  double ind[1024], outd[1024];
234  int n;
235 
236  emms_c();
237  for (n = 0; n < sz * sz; n++) {
238  if (bit_depth == 8)
239  ind[n] = buf[n];
240  else
241  ind[n] = ((int32_t *) buf)[n];
242  }
243  ftx_2d(outd, ind, tx, txtp, sz);
244  for (n = 0; n < sz * sz; n++) {
245  if (bit_depth == 8)
246  buf[n] = lrint(outd[n]);
247  else
248  ((int32_t *) buf)[n] = lrint(outd[n]);
249  }
250 }
251 
252 static int copy_subcoefs(int16_t *out, const int16_t *in, enum TxfmMode tx,
253  enum TxfmType txtp, int sz, int sub, int bit_depth)
254 {
255  // copy the topleft coefficients such that the return value (being the
256  // coefficient scantable index for the eob token) guarantees that only
257  // the topleft $sub out of $sz (where $sz >= $sub) coefficients in both
258  // dimensions are non-zero. This leads to braching to specific optimized
259  // simd versions (e.g. dc-only) so that we get full asm coverage in this
260  // test
261 
262  int n;
263  const int16_t *scan = ff_vp9_scans[tx][txtp];
264  int eob;
265 
266  for (n = 0; n < sz * sz; n++) {
267  int rc = scan[n], rcx = rc % sz, rcy = rc / sz;
268 
269  // find eob for this sub-idct
270  if (rcx >= sub || rcy >= sub)
271  break;
272 
273  // copy coef
274  if (bit_depth == 8) {
275  out[rc] = in[rc];
276  } else {
277  AV_COPY32(&out[rc * 2], &in[rc * 2]);
278  }
279  }
280 
281  eob = n;
282 
283  for (; n < sz * sz; n++) {
284  int rc = scan[n];
285 
286  // zero
287  if (bit_depth == 8) {
288  out[rc] = 0;
289  } else {
290  AV_ZERO32(&out[rc * 2]);
291  }
292  }
293 
294  return eob;
295 }
296 
/**
 * Check whether a coefficient buffer is entirely zero.
 *
 * The buffer is scanned 32 bits (two int16_t) at a time with AV_RN32A.
 *
 * @param c  buffer to inspect
 * @param sz buffer size in bytes
 * @return 1 if every 32-bit word read is zero, 0 otherwise
 */
static int is_zero(const int16_t *c, int sz)
{
    int n;

    for (n = 0; n < sz / sizeof(int16_t); n += 2)
        if (AV_RN32A(&c[n]))
            return 0;

    return 1;
}
307 
/* Bytes per coefficient: 2 for 8-bit content, 4 for 10/12-bit content.
 * Reads a variable named bit_depth at the point of use. */
#define SIZEOF_COEF (2 * ((bit_depth + 7) / 8))
309 
310 static void check_itxfm(void)
311 {
312  LOCAL_ALIGNED_32(uint8_t, src, [32 * 32 * 2]);
313  LOCAL_ALIGNED_32(uint8_t, dst, [32 * 32 * 2]);
314  LOCAL_ALIGNED_32(uint8_t, dst0, [32 * 32 * 2]);
315  LOCAL_ALIGNED_32(uint8_t, dst1, [32 * 32 * 2]);
316  LOCAL_ALIGNED_32(int16_t, coef, [32 * 32 * 2]);
317  LOCAL_ALIGNED_32(int16_t, subcoef0, [32 * 32 * 2]);
318  LOCAL_ALIGNED_32(int16_t, subcoef1, [32 * 32 * 2]);
319  declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
320  VP9DSPContext dsp;
321  int y, x, tx, txtp, bit_depth, sub;
322  static const char *const txtp_types[N_TXFM_TYPES] = {
323  [DCT_DCT] = "dct_dct", [DCT_ADST] = "adst_dct",
324  [ADST_DCT] = "dct_adst", [ADST_ADST] = "adst_adst"
325  };
326 
327  for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
328  ff_vp9dsp_init(&dsp, bit_depth, 0);
329 
330  for (tx = TX_4X4; tx <= N_TXFM_SIZES /* 4 = lossless */; tx++) {
331  int sz = 4 << (tx & 3);
332  int n_txtps = tx < TX_32X32 ? N_TXFM_TYPES : 1;
333 
334  for (txtp = 0; txtp < n_txtps; txtp++) {
335  // skip testing sub-IDCTs for WHT or ADST since they don't
336  // implement it in any of the SIMD functions. If they do,
337  // consider changing this to ensure we have complete test
338  // coverage. Test sub=1 for dc-only, then 2, 4, 8, 12, etc,
339  // since the arm version can distinguish them at that level.
340  for (sub = (txtp == 0 && tx < 4) ? 1 : sz; sub <= sz;
341  sub < 4 ? (sub <<= 1) : (sub += 4)) {
342  if (check_func(dsp.itxfm_add[tx][txtp],
343  "vp9_inv_%s_%dx%d_sub%d_add_%d",
344  tx == 4 ? "wht_wht" : txtp_types[txtp],
345  sz, sz, sub, bit_depth)) {
346  int eob;
347 
349  ftx(coef, tx, txtp, sz, bit_depth);
350 
351  if (sub < sz) {
352  eob = copy_subcoefs(subcoef0, coef, tx, txtp,
353  sz, sub, bit_depth);
354  } else {
355  eob = sz * sz;
356  memcpy(subcoef0, coef, sz * sz * SIZEOF_COEF);
357  }
358 
359  memcpy(dst0, dst, sz * sz * SIZEOF_PIXEL);
360  memcpy(dst1, dst, sz * sz * SIZEOF_PIXEL);
361  memcpy(subcoef1, subcoef0, sz * sz * SIZEOF_COEF);
362  call_ref(dst0, sz * SIZEOF_PIXEL, subcoef0, eob);
363  call_new(dst1, sz * SIZEOF_PIXEL, subcoef1, eob);
364  if (memcmp(dst0, dst1, sz * sz * SIZEOF_PIXEL) ||
365  !is_zero(subcoef0, sz * sz * SIZEOF_COEF) ||
366  !is_zero(subcoef1, sz * sz * SIZEOF_COEF))
367  fail();
368 
369  bench_new(dst, sz * SIZEOF_PIXEL, coef, eob);
370  }
371  }
372  }
373  }
374  }
375  report("itxfm");
376 }
377 
#undef randomize_buffers

/* Store pixel value c, clipped to the valid range for the current depth, at
 * position (a, b) of buf0. Expands in a scope that provides `buf0`,
 * `jstride` and `bit_depth`. */
#define setpx(a,b,c)                                                          \
    do {                                                                      \
        if (SIZEOF_PIXEL == 1) {                                              \
            buf0[(a) + (b) * jstride] = av_clip_uint8(c);                     \
        } else {                                                              \
            ((uint16_t *)buf0)[(a) + (b) * jstride] = av_clip_uintp2(c, bit_depth); \
        }                                                                     \
    } while (0)

// c can be an assignment and must not be put under ()
/* setdx: like setpx, with a random offset in [-d, d] added to c.
 * setsx: like setdx, with d scaled up by (bit_depth - 8) bits. */
#define setdx(a,b,c,d) setpx(a,b,c-(d)+(rnd()%((d)*2+1)))
#define setsx(a,b,c,d) setdx(a,b,c,(d) << (bit_depth - 8))
/**
 * Fill eight lines across a loop-filter edge with test pixels.
 *
 * The lines are generated in four pairs, following the labels in the code:
 * "flat16" and "flat8" keep pixels within F[bidx] of the edge pixels (over
 * 8 resp. 4 pixels on each side), "regular" chains differences bounded by
 * I[bidx], and "off" is fully random. In the first three pairs the pixel
 * just before the edge is kept within E[bidx] >> 2 of the one just after it.
 * All deltas are scaled by bit_depth - 8 bits via setsx().
 *
 * @param bidx      index into the E/F/I threshold arrays
 * @param lineoff   index of the first of the eight lines to fill
 * @param str       stride, in pixels, between positions across the edge
 *                  when dir is non-zero
 * @param bit_depth bits per pixel
 * @param dir       0: lines are 16 pixels apart and positions across the
 *                  edge are adjacent; non-zero: lines are adjacent and
 *                  positions across the edge are str pixels apart
 * @param E, F, I   threshold arrays, see above
 * @param H         unused
 * @param buf0      buffer to fill; points at the first pixel after the edge
 * @param buf1      unused
 */
static void randomize_loopfilter_buffers(int bidx, int lineoff, int str,
                                         int bit_depth, int dir, const int *E,
                                         const int *F, const int *H, const int *I,
                                         uint8_t *buf0, uint8_t *buf1)
{
    uint32_t mask = (1 << bit_depth) - 1;
    int off = dir ? lineoff : lineoff * 16;
    int istride = dir ? 1 : 16;
    int jstride = dir ? str : 1;
    int i, j;
    for (i = 0; i < 2; i++) /* flat16 */ {
        int idx = off + i * istride, p0, q0;
        setpx(idx,  0, q0 = rnd() & mask);
        setsx(idx, -1, p0 = q0, E[bidx] >> 2);
        for (j = 1; j < 8; j++) {
            setsx(idx, -1 - j, p0, F[bidx]);
            setsx(idx, j, q0, F[bidx]);
        }
    }
    for (i = 2; i < 4; i++) /* flat8 */ {
        int idx = off + i * istride, p0, q0;
        setpx(idx,  0, q0 = rnd() & mask);
        setsx(idx, -1, p0 = q0, E[bidx] >> 2);
        for (j = 1; j < 4; j++) {
            setsx(idx, -1 - j, p0, F[bidx]);
            setsx(idx, j, q0, F[bidx]);
        }
        for (j = 4; j < 8; j++) {
            setpx(idx, -1 - j, rnd() & mask);
            setpx(idx, j, rnd() & mask);
        }
    }
    for (i = 4; i < 6; i++) /* regular */ {
        int idx = off + i * istride, p2, p1, p0, q0, q1, q2;
        setpx(idx,  0, q0 = rnd() & mask);
        setsx(idx,  1, q1 = q0, I[bidx]);
        setsx(idx,  2, q2 = q1, I[bidx]);
        setsx(idx,  3, q2,      I[bidx]);
        setsx(idx, -1, p0 = q0, E[bidx] >> 2);
        setsx(idx, -2, p1 = p0, I[bidx]);
        setsx(idx, -3, p2 = p1, I[bidx]);
        setsx(idx, -4, p2,      I[bidx]);
        for (j = 4; j < 8; j++) {
            setpx(idx, -1 - j, rnd() & mask);
            setpx(idx, j, rnd() & mask);
        }
    }
    for (i = 6; i < 8; i++) /* off */ {
        int idx = off + i * istride;
        for (j = 0; j < 8; j++) {
            setpx(idx, -1 - j, rnd() & mask);
            setpx(idx, j, rnd() & mask);
        }
    }
}
/* Convenience wrapper: forwards the caller's bit_depth, dir, E, F, H, I,
 * buf0 and buf1 (all taken from the expansion scope) to
 * randomize_loopfilter_buffers(). */
#define randomize_buffers(bidx, lineoff, str) \
    randomize_loopfilter_buffers(bidx, lineoff, str, bit_depth, dir, \
                                 E, F, H, I, buf0, buf1)
450 
451 static void check_loopfilter(void)
452 {
453  LOCAL_ALIGNED_32(uint8_t, base0, [32 + 16 * 16 * 2]);
454  LOCAL_ALIGNED_32(uint8_t, base1, [32 + 16 * 16 * 2]);
455  VP9DSPContext dsp;
456  int dir, wd, wd2, bit_depth;
457  static const char *const dir_name[2] = { "h", "v" };
458  static const int E[2] = { 20, 28 }, I[2] = { 10, 16 };
459  static const int H[2] = { 7, 11 }, F[2] = { 1, 1 };
460  declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
461 
462  for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
463  ff_vp9dsp_init(&dsp, bit_depth, 0);
464 
465  for (dir = 0; dir < 2; dir++) {
466  int midoff = (dir ? 8 * 8 : 8) * SIZEOF_PIXEL;
467  int midoff_aligned = (dir ? 8 * 8 : 16) * SIZEOF_PIXEL;
468  uint8_t *buf0 = base0 + midoff_aligned;
469  uint8_t *buf1 = base1 + midoff_aligned;
470 
471  for (wd = 0; wd < 3; wd++) {
472  // 4/8/16wd_8px
473  if (check_func(dsp.loop_filter_8[wd][dir],
474  "vp9_loop_filter_%s_%d_8_%dbpp",
475  dir_name[dir], 4 << wd, bit_depth)) {
476  randomize_buffers(0, 0, 8);
477  memcpy(buf1 - midoff, buf0 - midoff,
478  16 * 8 * SIZEOF_PIXEL);
479  call_ref(buf0, 16 * SIZEOF_PIXEL >> dir, E[0], I[0], H[0]);
480  call_new(buf1, 16 * SIZEOF_PIXEL >> dir, E[0], I[0], H[0]);
481  if (memcmp(buf0 - midoff, buf1 - midoff, 16 * 8 * SIZEOF_PIXEL))
482  fail();
483  bench_new(buf1, 16 * SIZEOF_PIXEL >> dir, E[0], I[0], H[0]);
484  }
485  }
486 
487  midoff = (dir ? 16 * 8 : 8) * SIZEOF_PIXEL;
488  midoff_aligned = (dir ? 16 * 8 : 16) * SIZEOF_PIXEL;
489 
490  buf0 = base0 + midoff_aligned;
491  buf1 = base1 + midoff_aligned;
492 
493  // 16wd_16px loopfilter
494  if (check_func(dsp.loop_filter_16[dir],
495  "vp9_loop_filter_%s_16_16_%dbpp",
496  dir_name[dir], bit_depth)) {
497  randomize_buffers(0, 0, 16);
498  randomize_buffers(0, 8, 16);
499  memcpy(buf1 - midoff, buf0 - midoff, 16 * 16 * SIZEOF_PIXEL);
500  call_ref(buf0, 16 * SIZEOF_PIXEL, E[0], I[0], H[0]);
501  call_new(buf1, 16 * SIZEOF_PIXEL, E[0], I[0], H[0]);
502  if (memcmp(buf0 - midoff, buf1 - midoff, 16 * 16 * SIZEOF_PIXEL))
503  fail();
504  bench_new(buf1, 16 * SIZEOF_PIXEL, E[0], I[0], H[0]);
505  }
506 
507  for (wd = 0; wd < 2; wd++) {
508  for (wd2 = 0; wd2 < 2; wd2++) {
509  // mix2 loopfilter
510  if (check_func(dsp.loop_filter_mix2[wd][wd2][dir],
511  "vp9_loop_filter_mix2_%s_%d%d_16_%dbpp",
512  dir_name[dir], 4 << wd, 4 << wd2, bit_depth)) {
513  randomize_buffers(0, 0, 16);
514  randomize_buffers(1, 8, 16);
515  memcpy(buf1 - midoff, buf0 - midoff, 16 * 16 * SIZEOF_PIXEL);
516 #define M(a) (((a)[1] << 8) | (a)[0])
517  call_ref(buf0, 16 * SIZEOF_PIXEL, M(E), M(I), M(H));
518  call_new(buf1, 16 * SIZEOF_PIXEL, M(E), M(I), M(H));
519  if (memcmp(buf0 - midoff, buf1 - midoff, 16 * 16 * SIZEOF_PIXEL))
520  fail();
521  bench_new(buf1, 16 * SIZEOF_PIXEL, M(E), M(I), M(H));
522 #undef M
523  }
524  }
525  }
526  }
527  }
528  report("loopfilter");
529 }
530 
#undef setsx
#undef setpx
#undef setdx
#undef randomize_buffers

/* Buffer geometry for the motion-compensation test. These macros read
 * `size`, `bit_depth` (through SIZEOF_PIXEL) and `buf` from the scope in
 * which they are expanded. */
#define DST_BUF_SIZE (size * size * SIZEOF_PIXEL)
#define SRC_BUF_STRIDE 72
/* size + 7 rows of SRC_BUF_STRIDE pixels */
#define SRC_BUF_SIZE ((size + 7) * SRC_BUF_STRIDE * SIZEOF_PIXEL)
/* block origin inside buf: 3 rows down and 3 pixels in */
#define src (buf + 3 * SIZEOF_PIXEL * (SRC_BUF_STRIDE + 1))
540 
/* Fill the first SRC_BUF_SIZE bytes of `buf` with random, depth-masked
 * samples. When op == 1, additionally fill dst0 and dst1 with identical
 * random content. Expands in a scope that provides `buf`, `dst0`, `dst1`,
 * `op`, `size` and `bit_depth`. */
#define randomize_buffers()                                                   \
    do {                                                                      \
        uint32_t mask = pixel_mask[(bit_depth - 8) >> 1];                     \
        int k;                                                                \
        for (k = 0; k < SRC_BUF_SIZE; k += 4) {                               \
            uint32_t r = rnd() & mask;                                        \
            AV_WN32A(buf + k, r);                                             \
        }                                                                     \
        if (op == 1) {                                                        \
            for (k = 0; k < DST_BUF_SIZE; k += 4) {                           \
                uint32_t r = rnd() & mask;                                    \
                AV_WN32A(dst0 + k, r);                                        \
                AV_WN32A(dst1 + k, r);                                        \
            }                                                                 \
        }                                                                     \
    } while (0)
557 
558 static void check_mc(void)
559 {
560  LOCAL_ALIGNED_32(uint8_t, buf, [72 * 72 * 2]);
561  LOCAL_ALIGNED_32(uint8_t, dst0, [64 * 64 * 2]);
562  LOCAL_ALIGNED_32(uint8_t, dst1, [64 * 64 * 2]);
563  VP9DSPContext dsp;
564  int op, hsize, bit_depth, filter, dx, dy;
565  declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t dst_stride,
566  const uint8_t *ref, ptrdiff_t ref_stride,
567  int h, int mx, int my);
568  static const char *const filter_names[4] = {
569  "8tap_smooth", "8tap_regular", "8tap_sharp", "bilin"
570  };
571  static const char *const subpel_names[2][2] = { { "", "h" }, { "v", "hv" } };
572  static const char *const op_names[2] = { "put", "avg" };
573  char str[256];
574 
575  for (op = 0; op < 2; op++) {
576  for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
577  ff_vp9dsp_init(&dsp, bit_depth, 0);
578  for (hsize = 0; hsize < 5; hsize++) {
579  int size = 64 >> hsize;
580 
581  for (filter = 0; filter < 4; filter++) {
582  for (dx = 0; dx < 2; dx++) {
583  for (dy = 0; dy < 2; dy++) {
584  if (dx || dy) {
585  snprintf(str, sizeof(str),
586  "%s_%s_%d%s", op_names[op],
587  filter_names[filter], size,
588  subpel_names[dy][dx]);
589  } else {
590  snprintf(str, sizeof(str),
591  "%s%d", op_names[op], size);
592  }
593  if (check_func(dsp.mc[hsize][filter][op][dx][dy],
594  "vp9_%s_%dbpp", str, bit_depth)) {
595  int mx = dx ? 1 + (rnd() % 14) : 0;
596  int my = dy ? 1 + (rnd() % 14) : 0;
598  call_ref(dst0, size * SIZEOF_PIXEL,
600  size, mx, my);
601  call_new(dst1, size * SIZEOF_PIXEL,
603  size, mx, my);
604  if (memcmp(dst0, dst1, DST_BUF_SIZE))
605  fail();
606 
607  // simd implementations for each filter of subpel
608  // functions are identical
609  if (filter >= 1 && filter <= 2) continue;
610  // 10/12 bpp for bilin are identical
611  if (bit_depth == 12 && filter == 3) continue;
612 
613  bench_new(dst1, size * SIZEOF_PIXEL,
615  size, mx, my);
616  }
617  }
618  }
619  }
620  }
621  }
622  }
623  report("mc");
624 }
625 
/**
 * Entry point of the VP9 DSP checks: runs the intra-prediction,
 * inverse-transform, loop-filter and motion-compensation tests in turn.
 *
 * The signature line and the check_loopfilter() call (listing lines 626 and
 * 630) were lost in extraction and are restored here.
 */
void checkasm_check_vp9dsp(void)
{
    check_ipred();
    check_itxfm();
    check_loopfilter();
    check_mc();
}
declare_func_emms
#define declare_func_emms(cpu_flags, ret,...)
Definition: checkasm.h:128
bit_depth
static void bit_depth(AudioStatsContext *s, uint64_t mask, uint64_t imask, AVRational *depth)
Definition: af_astats.c:226
q1
static const uint8_t q1[256]
Definition: twofish.c:100
fwht_1d
static void fwht_1d(double *out, const double *in, int sz)
Definition: vp9dsp.c:123
setpx
#define setpx(a, b, c)
Definition: vp9dsp.c:380
mem_internal.h
DC_128_PRED
@ DC_128_PRED
Definition: vp9.h:58
out
FILE * out
Definition: movenc.c:54
N_TXFM_TYPES
@ N_TXFM_TYPES
Definition: vp9.h:42
sub
static float sub(float src0, float src1)
Definition: dnn_backend_native_layer_mathbinary.c:31
VP9DSPContext::loop_filter_8
void(* loop_filter_8[3][2])(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr)
Definition: vp9dsp.h:80
TM_VP8_PRED
@ TM_VP8_PRED
Definition: vp9.h:55
DC_PRED
@ DC_PRED
Definition: vp9.h:48
VP9DSPContext
Definition: vp9dsp.h:39
check_func
#define check_func(func,...)
Definition: checkasm.h:122
t0
#define t0
Definition: regdef.h:28
randomize_loopfilter_buffers
static void randomize_loopfilter_buffers(int bidx, int lineoff, int str, int bit_depth, int dir, const int *E, const int *F, const int *H, const int *I, uint8_t *buf0, uint8_t *buf1)
Definition: vp9dsp.c:392
VERT_LEFT_PRED
@ VERT_LEFT_PRED
Definition: vp9.h:53
fadst4_1d
static void fadst4_1d(double *out, const double *in, int sz)
Definition: vp9dsp.c:153
F
#define F(x)
t1
#define t1
Definition: regdef.h:29
mathematics.h
filter
filter_frame For filters that do not use the this method is called when a frame is pushed to the filter s input It can be called at any time except in a reentrant way If the input frame is enough to produce then the filter should push the output frames on the output link immediately As an exception to the previous rule if the input frame is enough to produce several output frames then the filter needs output only at least one per link The additional frames can be left buffered in the filter
Definition: filter_design.txt:228
call_ref
#define call_ref(...)
Definition: checkasm.h:137
check_itxfm
static void check_itxfm(void)
Definition: vp9dsp.c:310
N_TXFM_SIZES
@ N_TXFM_SIZES
Definition: vp9.h:32
DC_127_PRED
@ DC_127_PRED
Definition: vp9.h:59
VP9DSPContext::loop_filter_mix2
void(* loop_filter_mix2[2][2][2])(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr)
Definition: vp9dsp.h:102
fail
#define fail()
Definition: checkasm.h:131
VERT_PRED
@ VERT_PRED
Definition: vp9.h:46
trunc
static __device__ float trunc(float a)
Definition: cuda_runtime.h:179
SIZEOF_PIXEL
#define SIZEOF_PIXEL
Definition: vp9dsp.c:33
checkasm.h
DIAG_DOWN_RIGHT_PRED
@ DIAG_DOWN_RIGHT_PRED
Definition: vp9.h:50
copy_subcoefs
static int copy_subcoefs(int16_t *out, const int16_t *in, enum TxfmMode tx, enum TxfmType txtp, int sz, int sub, int bit_depth)
Definition: vp9dsp.c:252
check_mc
static void check_mc(void)
Definition: vp9dsp.c:558
fadst_1d
static void fadst_1d(double *out, const double *in, int sz)
Definition: vp9dsp.c:167
setsx
#define setsx(a, b, c, d)
Definition: vp9dsp.c:391
lrint
#define lrint
Definition: tablegen.h:53
rnd
#define rnd()
Definition: checkasm.h:115
SIZEOF_COEF
#define SIZEOF_COEF
Definition: vp9dsp.c:308
HOR_PRED
@ HOR_PRED
Definition: vp9.h:47
mask
static const uint16_t mask[17]
Definition: lzw.c:38
intreadwrite.h
src
#define src
Definition: vp9dsp.c:539
AV_ZERO32
#define AV_ZERO32(d)
Definition: intreadwrite.h:629
op
static int op(uint8_t **dst, const uint8_t *dst_end, GetByteContext *gb, int pixel, int count, int *x, int width, int linesize)
Perform decode operation.
Definition: anm.c:76
ff_vp9_scans
const int16_t *const ff_vp9_scans[5][4]
Definition: vp9data.c:600
vp9data.h
SRC_BUF_STRIDE
#define SRC_BUF_STRIDE
Definition: vp9dsp.c:537
LEFT_DC_PRED
@ LEFT_DC_PRED
Definition: vp9.h:56
check_loopfilter
static void check_loopfilter(void)
Definition: vp9dsp.c:451
ff_vp9dsp_init
av_cold void ff_vp9dsp_init(VP9DSPContext *dsp, int bpp, int bitexact)
Definition: vp9dsp.c:88
ftx_2d
static void ftx_2d(double *out, const double *in, enum TxfmMode tx, enum TxfmType txtp, int sz)
Definition: vp9dsp.c:179
ftx
static void ftx(int16_t *buf, enum TxfmMode tx, enum TxfmType txtp, int sz, int bit_depth)
Definition: vp9dsp.c:230
q0
static const uint8_t q0[256]
Definition: twofish.c:81
E
#define E
Definition: avdct.c:32
DCT_ADST
@ DCT_ADST
Definition: vp9.h:39
call_new
#define call_new(...)
Definition: checkasm.h:209
LOCAL_ALIGNED_32
#define LOCAL_ALIGNED_32(t, v,...)
Definition: mem_internal.h:136
M
#define M(a)
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
VP9DSPContext::itxfm_add
void(* itxfm_add[N_TXFM_SIZES+1][N_TXFM_TYPES])(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob)
Definition: vp9dsp.h:70
VP9DSPContext::intra_pred
void(* intra_pred[N_TXFM_SIZES][N_INTRA_PRED_MODES])(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *top)
Definition: vp9dsp.h:51
TxfmMode
TxfmMode
Definition: vp9.h:27
vp9.h
DCT_DCT
@ DCT_DCT
Definition: vp9.h:38
TxfmType
TxfmType
Definition: vp9.h:37
pixel_mask
static const uint32_t pixel_mask[3]
Definition: vp9dsp.c:32
N_INTRA_PRED_MODES
@ N_INTRA_PRED_MODES
Definition: vp9.h:61
size
int size
Definition: twinvq_data.h:10344
VERT_RIGHT_PRED
@ VERT_RIGHT_PRED
Definition: vp9.h:51
VP9DSPContext::mc
vp9_mc_func mc[5][N_FILTERS][2][2][2]
Definition: vp9dsp.h:114
TX_4X4
@ TX_4X4
Definition: vp9.h:28
a
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:41
H
#define H
Definition: pixlet.c:40
M_PI
#define M_PI
Definition: mathematics.h:52
ftx1d_fn
void(* ftx1d_fn)(double *out, const double *in, int sz)
Definition: vp9dsp.c:178
randomize_buffers
#define randomize_buffers()
Definition: vp9dsp.c:541
report
#define report
Definition: checkasm.h:134
is_zero
static int is_zero(const int16_t *c, int sz)
Definition: vp9dsp.c:297
bench_new
#define bench_new(...)
Definition: checkasm.h:272
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:269
fdct_1d
static void fdct_1d(double *out, const double *in, int sz)
Definition: vp9dsp.c:138
DC_129_PRED
@ DC_129_PRED
Definition: vp9.h:60
internal.h
t4
#define t4
Definition: regdef.h:32
t3
#define t3
Definition: regdef.h:31
common.h
ADST_ADST
@ ADST_ADST
Definition: vp9.h:41
AV_COPY32
#define AV_COPY32(d, s)
Definition: intreadwrite.h:601
AV_RN32A
#define AV_RN32A(p)
Definition: intreadwrite.h:526
stride
#define stride
Definition: h264pred_template.c:537
left
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2] ... the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so ...,+,-,+,-,+,+,-,+,-,+,... hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32 - hcoeff[1] - hcoeff[2] - ... a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2} an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||......... intra?||||:Block01 :yes no||||:Block02 :....... 
..........||||:Block03 ::y DC ::ref index:||||:Block04 ::cb DC ::motion x :||||......... :cr DC ::motion y :||||....... ..........|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------ ------------ ------------|||Y subbands||Cb subbands||Cr subbands||||--- ---||--- ---||--- ---|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------ ------------ ------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction ------------|\ Dequantization ------------------- \||Reference frames|\ IDWT|------- -------|Motion \|||Frame 0||Frame 1||Compensation . OBMC v -------|------- -------|--------------. \------> Frame n output Frame Frame<----------------------------------/|...|------------------- Range Coder:============Binary Range Coder:------------------- The implemented range coder is an adapted version based upon "Range encoding: an algorithm for removing redundancy from a digitised message." by G. N. N. Martin. The symbols encoded by the Snow range coder are bits(0|1). The associated probabilities are not fix but change depending on the symbol mix seen so far. 
bit seen|new state ---------+----------------------------------------------- 0|256 - state_transition_table[256 - old_state];1|state_transition_table[old_state];state_transition_table={ 0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:------------------------- FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector 
Prediction:=========================1. the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled left
Definition: snow.txt:386
M_SQRT1_2
#define M_SQRT1_2
Definition: mathematics.h:58
AV_CPU_FLAG_MMX
#define AV_CPU_FLAG_MMX
standard MMX
Definition: cpu.h:29
t2
#define t2
Definition: regdef.h:30
HOR_UP_PRED
@ HOR_UP_PRED
Definition: vp9.h:54
mode
mode
Definition: ebur128.h:83
ref
static int ref[MAX_W *MAX_W]
Definition: jpeg2000dwt.c:112
temp
else temp
Definition: vf_mcdeint.c:248
HOR_DOWN_PRED
@ HOR_DOWN_PRED
Definition: vp9.h:52
AV_CPU_FLAG_MMXEXT
#define AV_CPU_FLAG_MMXEXT
SSE integer functions or AMD MMX ext.
Definition: cpu.h:30
TX_32X32
@ TX_32X32
Definition: vp9.h:31
TOP_DC_PRED
@ TOP_DC_PRED
Definition: vp9.h:57
ADST_DCT
@ ADST_DCT
Definition: vp9.h:40
DIAG_DOWN_LEFT_PRED
@ DIAG_DOWN_LEFT_PRED
Definition: vp9.h:49
int32_t
int32_t
Definition: audioconvert.c:56
convert_header.str
string str
Definition: convert_header.py:20
block
The exact code depends on how similar the blocks are and how related they are to the block
Definition: filter_design.txt:207
h
h
Definition: vp9dsp_template.c:2038
VP9DSPContext::loop_filter_16
void(* loop_filter_16[2])(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr)
Definition: vp9dsp.h:88
snprintf
#define snprintf
Definition: snprintf.h:34
checkasm_check_vp9dsp
void checkasm_check_vp9dsp(void)
Definition: vp9dsp.c:626
check_ipred
static void check_ipred(void)
Definition: vp9dsp.c:49
DST_BUF_SIZE
#define DST_BUF_SIZE
Definition: vp9dsp.c:536