FFmpeg
vp9dsp_template.c
Go to the documentation of this file.
1 /*
2  * VP9 compatible video decoder
3  *
4  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
5  * Copyright (C) 2013 Clément Bœsch <u pkh me>
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #include "libavutil/common.h"
25 #include "bit_depth_template.c"
26 #include "vp9dsp.h"
27 
28 #if BIT_DEPTH != 12
29 
30 // FIXME see whether we can merge parts of this (perhaps at least 4x4 and 8x8)
31 // back with h264pred.[ch]
32 
33 static void vert_4x4_c(uint8_t *_dst, ptrdiff_t stride,
34  const uint8_t *left, const uint8_t *_top)
35 {
36  pixel *dst = (pixel *) _dst;
37  const pixel *top = (const pixel *) _top;
38  pixel4 p4 = AV_RN4PA(top);
39 
40  stride /= sizeof(pixel);
41  AV_WN4PA(dst + stride * 0, p4);
42  AV_WN4PA(dst + stride * 1, p4);
43  AV_WN4PA(dst + stride * 2, p4);
44  AV_WN4PA(dst + stride * 3, p4);
45 }
46 
47 static void vert_8x8_c(uint8_t *_dst, ptrdiff_t stride,
48  const uint8_t *left, const uint8_t *_top)
49 {
50  pixel *dst = (pixel *) _dst;
51  const pixel *top = (const pixel *) _top;
52  pixel4 p4a = AV_RN4PA(top + 0);
53  pixel4 p4b = AV_RN4PA(top + 4);
54  int y;
55 
56  stride /= sizeof(pixel);
57  for (y = 0; y < 8; y++) {
58  AV_WN4PA(dst + 0, p4a);
59  AV_WN4PA(dst + 4, p4b);
60  dst += stride;
61  }
62 }
63 
64 static void vert_16x16_c(uint8_t *_dst, ptrdiff_t stride,
65  const uint8_t *left, const uint8_t *_top)
66 {
67  pixel *dst = (pixel *) _dst;
68  const pixel *top = (const pixel *) _top;
69  pixel4 p4a = AV_RN4PA(top + 0);
70  pixel4 p4b = AV_RN4PA(top + 4);
71  pixel4 p4c = AV_RN4PA(top + 8);
72  pixel4 p4d = AV_RN4PA(top + 12);
73  int y;
74 
75  stride /= sizeof(pixel);
76  for (y = 0; y < 16; y++) {
77  AV_WN4PA(dst + 0, p4a);
78  AV_WN4PA(dst + 4, p4b);
79  AV_WN4PA(dst + 8, p4c);
80  AV_WN4PA(dst + 12, p4d);
81  dst += stride;
82  }
83 }
84 
85 static void vert_32x32_c(uint8_t *_dst, ptrdiff_t stride,
86  const uint8_t *left, const uint8_t *_top)
87 {
88  pixel *dst = (pixel *) _dst;
89  const pixel *top = (const pixel *) _top;
90  pixel4 p4a = AV_RN4PA(top + 0);
91  pixel4 p4b = AV_RN4PA(top + 4);
92  pixel4 p4c = AV_RN4PA(top + 8);
93  pixel4 p4d = AV_RN4PA(top + 12);
94  pixel4 p4e = AV_RN4PA(top + 16);
95  pixel4 p4f = AV_RN4PA(top + 20);
96  pixel4 p4g = AV_RN4PA(top + 24);
97  pixel4 p4h = AV_RN4PA(top + 28);
98  int y;
99 
100  stride /= sizeof(pixel);
101  for (y = 0; y < 32; y++) {
102  AV_WN4PA(dst + 0, p4a);
103  AV_WN4PA(dst + 4, p4b);
104  AV_WN4PA(dst + 8, p4c);
105  AV_WN4PA(dst + 12, p4d);
106  AV_WN4PA(dst + 16, p4e);
107  AV_WN4PA(dst + 20, p4f);
108  AV_WN4PA(dst + 24, p4g);
109  AV_WN4PA(dst + 28, p4h);
110  dst += stride;
111  }
112 }
113 
114 static void hor_4x4_c(uint8_t *_dst, ptrdiff_t stride,
115  const uint8_t *_left, const uint8_t *top)
116 {
117  pixel *dst = (pixel *) _dst;
118  const pixel *left = (const pixel *) _left;
119 
120  stride /= sizeof(pixel);
121  AV_WN4PA(dst + stride * 0, PIXEL_SPLAT_X4(left[3]));
122  AV_WN4PA(dst + stride * 1, PIXEL_SPLAT_X4(left[2]));
123  AV_WN4PA(dst + stride * 2, PIXEL_SPLAT_X4(left[1]));
124  AV_WN4PA(dst + stride * 3, PIXEL_SPLAT_X4(left[0]));
125 }
126 
127 static void hor_8x8_c(uint8_t *_dst, ptrdiff_t stride,
128  const uint8_t *_left, const uint8_t *top)
129 {
130  pixel *dst = (pixel *) _dst;
131  const pixel *left = (const pixel *) _left;
132  int y;
133 
134  stride /= sizeof(pixel);
135  for (y = 0; y < 8; y++) {
136  pixel4 p4 = PIXEL_SPLAT_X4(left[7 - y]);
137 
138  AV_WN4PA(dst + 0, p4);
139  AV_WN4PA(dst + 4, p4);
140  dst += stride;
141  }
142 }
143 
144 static void hor_16x16_c(uint8_t *_dst, ptrdiff_t stride,
145  const uint8_t *_left, const uint8_t *top)
146 {
147  pixel *dst = (pixel *) _dst;
148  const pixel *left = (const pixel *) _left;
149  int y;
150 
151  stride /= sizeof(pixel);
152  for (y = 0; y < 16; y++) {
153  pixel4 p4 = PIXEL_SPLAT_X4(left[15 - y]);
154 
155  AV_WN4PA(dst + 0, p4);
156  AV_WN4PA(dst + 4, p4);
157  AV_WN4PA(dst + 8, p4);
158  AV_WN4PA(dst + 12, p4);
159  dst += stride;
160  }
161 }
162 
163 static void hor_32x32_c(uint8_t *_dst, ptrdiff_t stride,
164  const uint8_t *_left, const uint8_t *top)
165 {
166  pixel *dst = (pixel *) _dst;
167  const pixel *left = (const pixel *) _left;
168  int y;
169 
170  stride /= sizeof(pixel);
171  for (y = 0; y < 32; y++) {
172  pixel4 p4 = PIXEL_SPLAT_X4(left[31 - y]);
173 
174  AV_WN4PA(dst + 0, p4);
175  AV_WN4PA(dst + 4, p4);
176  AV_WN4PA(dst + 8, p4);
177  AV_WN4PA(dst + 12, p4);
178  AV_WN4PA(dst + 16, p4);
179  AV_WN4PA(dst + 20, p4);
180  AV_WN4PA(dst + 24, p4);
181  AV_WN4PA(dst + 28, p4);
182  dst += stride;
183  }
184 }
185 
186 #endif /* BIT_DEPTH != 12 */
187 
188 static void tm_4x4_c(uint8_t *_dst, ptrdiff_t stride,
189  const uint8_t *_left, const uint8_t *_top)
190 {
191  pixel *dst = (pixel *) _dst;
192  const pixel *left = (const pixel *) _left;
193  const pixel *top = (const pixel *) _top;
194  int y, tl = top[-1];
195 
196  stride /= sizeof(pixel);
197  for (y = 0; y < 4; y++) {
198  int l_m_tl = left[3 - y] - tl;
199 
200  dst[0] = av_clip_pixel(top[0] + l_m_tl);
201  dst[1] = av_clip_pixel(top[1] + l_m_tl);
202  dst[2] = av_clip_pixel(top[2] + l_m_tl);
203  dst[3] = av_clip_pixel(top[3] + l_m_tl);
204  dst += stride;
205  }
206 }
207 
208 static void tm_8x8_c(uint8_t *_dst, ptrdiff_t stride,
209  const uint8_t *_left, const uint8_t *_top)
210 {
211  pixel *dst = (pixel *) _dst;
212  const pixel *left = (const pixel *) _left;
213  const pixel *top = (const pixel *) _top;
214  int y, tl = top[-1];
215 
216  stride /= sizeof(pixel);
217  for (y = 0; y < 8; y++) {
218  int l_m_tl = left[7 - y] - tl;
219 
220  dst[0] = av_clip_pixel(top[0] + l_m_tl);
221  dst[1] = av_clip_pixel(top[1] + l_m_tl);
222  dst[2] = av_clip_pixel(top[2] + l_m_tl);
223  dst[3] = av_clip_pixel(top[3] + l_m_tl);
224  dst[4] = av_clip_pixel(top[4] + l_m_tl);
225  dst[5] = av_clip_pixel(top[5] + l_m_tl);
226  dst[6] = av_clip_pixel(top[6] + l_m_tl);
227  dst[7] = av_clip_pixel(top[7] + l_m_tl);
228  dst += stride;
229  }
230 }
231 
232 static void tm_16x16_c(uint8_t *_dst, ptrdiff_t stride,
233  const uint8_t *_left, const uint8_t *_top)
234 {
235  pixel *dst = (pixel *) _dst;
236  const pixel *left = (const pixel *) _left;
237  const pixel *top = (const pixel *) _top;
238  int y, tl = top[-1];
239 
240  stride /= sizeof(pixel);
241  for (y = 0; y < 16; y++) {
242  int l_m_tl = left[15 - y] - tl;
243 
244  dst[ 0] = av_clip_pixel(top[ 0] + l_m_tl);
245  dst[ 1] = av_clip_pixel(top[ 1] + l_m_tl);
246  dst[ 2] = av_clip_pixel(top[ 2] + l_m_tl);
247  dst[ 3] = av_clip_pixel(top[ 3] + l_m_tl);
248  dst[ 4] = av_clip_pixel(top[ 4] + l_m_tl);
249  dst[ 5] = av_clip_pixel(top[ 5] + l_m_tl);
250  dst[ 6] = av_clip_pixel(top[ 6] + l_m_tl);
251  dst[ 7] = av_clip_pixel(top[ 7] + l_m_tl);
252  dst[ 8] = av_clip_pixel(top[ 8] + l_m_tl);
253  dst[ 9] = av_clip_pixel(top[ 9] + l_m_tl);
254  dst[10] = av_clip_pixel(top[10] + l_m_tl);
255  dst[11] = av_clip_pixel(top[11] + l_m_tl);
256  dst[12] = av_clip_pixel(top[12] + l_m_tl);
257  dst[13] = av_clip_pixel(top[13] + l_m_tl);
258  dst[14] = av_clip_pixel(top[14] + l_m_tl);
259  dst[15] = av_clip_pixel(top[15] + l_m_tl);
260  dst += stride;
261  }
262 }
263 
264 static void tm_32x32_c(uint8_t *_dst, ptrdiff_t stride,
265  const uint8_t *_left, const uint8_t *_top)
266 {
267  pixel *dst = (pixel *) _dst;
268  const pixel *left = (const pixel *) _left;
269  const pixel *top = (const pixel *) _top;
270  int y, tl = top[-1];
271 
272  stride /= sizeof(pixel);
273  for (y = 0; y < 32; y++) {
274  int l_m_tl = left[31 - y] - tl;
275 
276  dst[ 0] = av_clip_pixel(top[ 0] + l_m_tl);
277  dst[ 1] = av_clip_pixel(top[ 1] + l_m_tl);
278  dst[ 2] = av_clip_pixel(top[ 2] + l_m_tl);
279  dst[ 3] = av_clip_pixel(top[ 3] + l_m_tl);
280  dst[ 4] = av_clip_pixel(top[ 4] + l_m_tl);
281  dst[ 5] = av_clip_pixel(top[ 5] + l_m_tl);
282  dst[ 6] = av_clip_pixel(top[ 6] + l_m_tl);
283  dst[ 7] = av_clip_pixel(top[ 7] + l_m_tl);
284  dst[ 8] = av_clip_pixel(top[ 8] + l_m_tl);
285  dst[ 9] = av_clip_pixel(top[ 9] + l_m_tl);
286  dst[10] = av_clip_pixel(top[10] + l_m_tl);
287  dst[11] = av_clip_pixel(top[11] + l_m_tl);
288  dst[12] = av_clip_pixel(top[12] + l_m_tl);
289  dst[13] = av_clip_pixel(top[13] + l_m_tl);
290  dst[14] = av_clip_pixel(top[14] + l_m_tl);
291  dst[15] = av_clip_pixel(top[15] + l_m_tl);
292  dst[16] = av_clip_pixel(top[16] + l_m_tl);
293  dst[17] = av_clip_pixel(top[17] + l_m_tl);
294  dst[18] = av_clip_pixel(top[18] + l_m_tl);
295  dst[19] = av_clip_pixel(top[19] + l_m_tl);
296  dst[20] = av_clip_pixel(top[20] + l_m_tl);
297  dst[21] = av_clip_pixel(top[21] + l_m_tl);
298  dst[22] = av_clip_pixel(top[22] + l_m_tl);
299  dst[23] = av_clip_pixel(top[23] + l_m_tl);
300  dst[24] = av_clip_pixel(top[24] + l_m_tl);
301  dst[25] = av_clip_pixel(top[25] + l_m_tl);
302  dst[26] = av_clip_pixel(top[26] + l_m_tl);
303  dst[27] = av_clip_pixel(top[27] + l_m_tl);
304  dst[28] = av_clip_pixel(top[28] + l_m_tl);
305  dst[29] = av_clip_pixel(top[29] + l_m_tl);
306  dst[30] = av_clip_pixel(top[30] + l_m_tl);
307  dst[31] = av_clip_pixel(top[31] + l_m_tl);
308  dst += stride;
309  }
310 }
311 
312 #if BIT_DEPTH != 12
313 
314 static void dc_4x4_c(uint8_t *_dst, ptrdiff_t stride,
315  const uint8_t *_left, const uint8_t *_top)
316 {
317  pixel *dst = (pixel *) _dst;
318  const pixel *left = (const pixel *) _left;
319  const pixel *top = (const pixel *) _top;
320  pixel4 dc = PIXEL_SPLAT_X4((left[0] + left[1] + left[2] + left[3] +
321  top[0] + top[1] + top[2] + top[3] + 4) >> 3);
322 
323  stride /= sizeof(pixel);
324  AV_WN4PA(dst + stride * 0, dc);
325  AV_WN4PA(dst + stride * 1, dc);
326  AV_WN4PA(dst + stride * 2, dc);
327  AV_WN4PA(dst + stride * 3, dc);
328 }
329 
330 static void dc_8x8_c(uint8_t *_dst, ptrdiff_t stride,
331  const uint8_t *_left, const uint8_t *_top)
332 {
333  pixel *dst = (pixel *) _dst;
334  const pixel *left = (const pixel *) _left;
335  const pixel *top = (const pixel *) _top;
337  ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] +
338  left[6] + left[7] + top[0] + top[1] + top[2] + top[3] +
339  top[4] + top[5] + top[6] + top[7] + 8) >> 4);
340  int y;
341 
342  stride /= sizeof(pixel);
343  for (y = 0; y < 8; y++) {
344  AV_WN4PA(dst + 0, dc);
345  AV_WN4PA(dst + 4, dc);
346  dst += stride;
347  }
348 }
349 
350 static void dc_16x16_c(uint8_t *_dst, ptrdiff_t stride,
351  const uint8_t *_left, const uint8_t *_top)
352 {
353  pixel *dst = (pixel *) _dst;
354  const pixel *left = (const pixel *) _left;
355  const pixel *top = (const pixel *) _top;
357  ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] + left[6] +
358  left[7] + left[8] + left[9] + left[10] + left[11] + left[12] +
359  left[13] + left[14] + left[15] + top[0] + top[1] + top[2] + top[3] +
360  top[4] + top[5] + top[6] + top[7] + top[8] + top[9] + top[10] +
361  top[11] + top[12] + top[13] + top[14] + top[15] + 16) >> 5);
362  int y;
363 
364  stride /= sizeof(pixel);
365  for (y = 0; y < 16; y++) {
366  AV_WN4PA(dst + 0, dc);
367  AV_WN4PA(dst + 4, dc);
368  AV_WN4PA(dst + 8, dc);
369  AV_WN4PA(dst + 12, dc);
370  dst += stride;
371  }
372 }
373 
374 static void dc_32x32_c(uint8_t *_dst, ptrdiff_t stride,
375  const uint8_t *_left, const uint8_t *_top)
376 {
377  pixel *dst = (pixel *) _dst;
378  const pixel *left = (const pixel *) _left;
379  const pixel *top = (const pixel *) _top;
381  ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] + left[6] +
382  left[7] + left[8] + left[9] + left[10] + left[11] + left[12] +
383  left[13] + left[14] + left[15] + left[16] + left[17] + left[18] +
384  left[19] + left[20] + left[21] + left[22] + left[23] + left[24] +
385  left[25] + left[26] + left[27] + left[28] + left[29] + left[30] +
386  left[31] + top[0] + top[1] + top[2] + top[3] + top[4] + top[5] +
387  top[6] + top[7] + top[8] + top[9] + top[10] + top[11] + top[12] +
388  top[13] + top[14] + top[15] + top[16] + top[17] + top[18] + top[19] +
389  top[20] + top[21] + top[22] + top[23] + top[24] + top[25] + top[26] +
390  top[27] + top[28] + top[29] + top[30] + top[31] + 32) >> 6);
391  int y;
392 
393  stride /= sizeof(pixel);
394  for (y = 0; y < 32; y++) {
395  AV_WN4PA(dst + 0, dc);
396  AV_WN4PA(dst + 4, dc);
397  AV_WN4PA(dst + 8, dc);
398  AV_WN4PA(dst + 12, dc);
399  AV_WN4PA(dst + 16, dc);
400  AV_WN4PA(dst + 20, dc);
401  AV_WN4PA(dst + 24, dc);
402  AV_WN4PA(dst + 28, dc);
403  dst += stride;
404  }
405 }
406 
407 static void dc_left_4x4_c(uint8_t *_dst, ptrdiff_t stride,
408  const uint8_t *_left, const uint8_t *top)
409 {
410  pixel *dst = (pixel *) _dst;
411  const pixel *left = (const pixel *) _left;
412  pixel4 dc = PIXEL_SPLAT_X4((left[0] + left[1] + left[2] + left[3] + 2) >> 2);
413 
414  stride /= sizeof(pixel);
415  AV_WN4PA(dst + stride * 0, dc);
416  AV_WN4PA(dst + stride * 1, dc);
417  AV_WN4PA(dst + stride * 2, dc);
418  AV_WN4PA(dst + stride * 3, dc);
419 }
420 
421 static void dc_left_8x8_c(uint8_t *_dst, ptrdiff_t stride,
422  const uint8_t *_left, const uint8_t *top)
423 {
424  pixel *dst = (pixel *) _dst;
425  const pixel *left = (const pixel *) _left;
427  ((left[0] + left[1] + left[2] + left[3] +
428  left[4] + left[5] + left[6] + left[7] + 4) >> 3);
429  int y;
430 
431  stride /= sizeof(pixel);
432  for (y = 0; y < 8; y++) {
433  AV_WN4PA(dst + 0, dc);
434  AV_WN4PA(dst + 4, dc);
435  dst += stride;
436  }
437 }
438 
439 static void dc_left_16x16_c(uint8_t *_dst, ptrdiff_t stride,
440  const uint8_t *_left, const uint8_t *top)
441 {
442  pixel *dst = (pixel *) _dst;
443  const pixel *left = (const pixel *) _left;
445  ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] +
446  left[6] + left[7] + left[8] + left[9] + left[10] + left[11] +
447  left[12] + left[13] + left[14] + left[15] + 8) >> 4);
448  int y;
449 
450  stride /= sizeof(pixel);
451  for (y = 0; y < 16; y++) {
452  AV_WN4PA(dst + 0, dc);
453  AV_WN4PA(dst + 4, dc);
454  AV_WN4PA(dst + 8, dc);
455  AV_WN4PA(dst + 12, dc);
456  dst += stride;
457  }
458 }
459 
460 static void dc_left_32x32_c(uint8_t *_dst, ptrdiff_t stride,
461  const uint8_t *_left, const uint8_t *top)
462 {
463  pixel *dst = (pixel *) _dst;
464  const pixel *left = (const pixel *) _left;
466  ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] +
467  left[6] + left[7] + left[8] + left[9] + left[10] + left[11] +
468  left[12] + left[13] + left[14] + left[15] + left[16] + left[17] +
469  left[18] + left[19] + left[20] + left[21] + left[22] + left[23] +
470  left[24] + left[25] + left[26] + left[27] + left[28] + left[29] +
471  left[30] + left[31] + 16) >> 5);
472  int y;
473 
474  stride /= sizeof(pixel);
475  for (y = 0; y < 32; y++) {
476  AV_WN4PA(dst + 0, dc);
477  AV_WN4PA(dst + 4, dc);
478  AV_WN4PA(dst + 8, dc);
479  AV_WN4PA(dst + 12, dc);
480  AV_WN4PA(dst + 16, dc);
481  AV_WN4PA(dst + 20, dc);
482  AV_WN4PA(dst + 24, dc);
483  AV_WN4PA(dst + 28, dc);
484  dst += stride;
485  }
486 }
487 
488 static void dc_top_4x4_c(uint8_t *_dst, ptrdiff_t stride,
489  const uint8_t *left, const uint8_t *_top)
490 {
491  pixel *dst = (pixel *) _dst;
492  const pixel *top = (const pixel *) _top;
493  pixel4 dc = PIXEL_SPLAT_X4((top[0] + top[1] + top[2] + top[3] + 2) >> 2);
494 
495  stride /= sizeof(pixel);
496  AV_WN4PA(dst + stride * 0, dc);
497  AV_WN4PA(dst + stride * 1, dc);
498  AV_WN4PA(dst + stride * 2, dc);
499  AV_WN4PA(dst + stride * 3, dc);
500 }
501 
502 static void dc_top_8x8_c(uint8_t *_dst, ptrdiff_t stride,
503  const uint8_t *left, const uint8_t *_top)
504 {
505  pixel *dst = (pixel *) _dst;
506  const pixel *top = (const pixel *) _top;
508  ((top[0] + top[1] + top[2] + top[3] +
509  top[4] + top[5] + top[6] + top[7] + 4) >> 3);
510  int y;
511 
512  stride /= sizeof(pixel);
513  for (y = 0; y < 8; y++) {
514  AV_WN4PA(dst + 0, dc);
515  AV_WN4PA(dst + 4, dc);
516  dst += stride;
517  }
518 }
519 
520 static void dc_top_16x16_c(uint8_t *_dst, ptrdiff_t stride,
521  const uint8_t *left, const uint8_t *_top)
522 {
523  pixel *dst = (pixel *) _dst;
524  const pixel *top = (const pixel *) _top;
526  ((top[0] + top[1] + top[2] + top[3] + top[4] + top[5] +
527  top[6] + top[7] + top[8] + top[9] + top[10] + top[11] +
528  top[12] + top[13] + top[14] + top[15] + 8) >> 4);
529  int y;
530 
531  stride /= sizeof(pixel);
532  for (y = 0; y < 16; y++) {
533  AV_WN4PA(dst + 0, dc);
534  AV_WN4PA(dst + 4, dc);
535  AV_WN4PA(dst + 8, dc);
536  AV_WN4PA(dst + 12, dc);
537  dst += stride;
538  }
539 }
540 
541 static void dc_top_32x32_c(uint8_t *_dst, ptrdiff_t stride,
542  const uint8_t *left, const uint8_t *_top)
543 {
544  pixel *dst = (pixel *) _dst;
545  const pixel *top = (const pixel *) _top;
547  ((top[0] + top[1] + top[2] + top[3] + top[4] + top[5] +
548  top[6] + top[7] + top[8] + top[9] + top[10] + top[11] +
549  top[12] + top[13] + top[14] + top[15] + top[16] + top[17] +
550  top[18] + top[19] + top[20] + top[21] + top[22] + top[23] +
551  top[24] + top[25] + top[26] + top[27] + top[28] + top[29] +
552  top[30] + top[31] + 16) >> 5);
553  int y;
554 
555  stride /= sizeof(pixel);
556  for (y = 0; y < 32; y++) {
557  AV_WN4PA(dst + 0, dc);
558  AV_WN4PA(dst + 4, dc);
559  AV_WN4PA(dst + 8, dc);
560  AV_WN4PA(dst + 12, dc);
561  AV_WN4PA(dst + 16, dc);
562  AV_WN4PA(dst + 20, dc);
563  AV_WN4PA(dst + 24, dc);
564  AV_WN4PA(dst + 28, dc);
565  dst += stride;
566  }
567 }
568 
569 #endif /* BIT_DEPTH != 12 */
570 
571 static void dc_128_4x4_c(uint8_t *_dst, ptrdiff_t stride,
572  const uint8_t *left, const uint8_t *top)
573 {
574  pixel *dst = (pixel *) _dst;
575  pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
576 
577  stride /= sizeof(pixel);
578  AV_WN4PA(dst + stride * 0, val);
579  AV_WN4PA(dst + stride * 1, val);
580  AV_WN4PA(dst + stride * 2, val);
581  AV_WN4PA(dst + stride * 3, val);
582 }
583 
584 static void dc_128_8x8_c(uint8_t *_dst, ptrdiff_t stride,
585  const uint8_t *left, const uint8_t *top)
586 {
587  pixel *dst = (pixel *) _dst;
588  pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
589  int y;
590 
591  stride /= sizeof(pixel);
592  for (y = 0; y < 8; y++) {
593  AV_WN4PA(dst + 0, val);
594  AV_WN4PA(dst + 4, val);
595  dst += stride;
596  }
597 }
598 
599 static void dc_128_16x16_c(uint8_t *_dst, ptrdiff_t stride,
600  const uint8_t *left, const uint8_t *top)
601 {
602  pixel *dst = (pixel *) _dst;
603  pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
604  int y;
605 
606  stride /= sizeof(pixel);
607  for (y = 0; y < 16; y++) {
608  AV_WN4PA(dst + 0, val);
609  AV_WN4PA(dst + 4, val);
610  AV_WN4PA(dst + 8, val);
611  AV_WN4PA(dst + 12, val);
612  dst += stride;
613  }
614 }
615 
616 static void dc_128_32x32_c(uint8_t *_dst, ptrdiff_t stride,
617  const uint8_t *left, const uint8_t *top)
618 {
619  pixel *dst = (pixel *) _dst;
620  pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
621  int y;
622 
623  stride /= sizeof(pixel);
624  for (y = 0; y < 32; y++) {
625  AV_WN4PA(dst + 0, val);
626  AV_WN4PA(dst + 4, val);
627  AV_WN4PA(dst + 8, val);
628  AV_WN4PA(dst + 12, val);
629  AV_WN4PA(dst + 16, val);
630  AV_WN4PA(dst + 20, val);
631  AV_WN4PA(dst + 24, val);
632  AV_WN4PA(dst + 28, val);
633  dst += stride;
634  }
635 }
636 
637 static void dc_127_4x4_c(uint8_t *_dst, ptrdiff_t stride,
638  const uint8_t *left, const uint8_t *top)
639 {
640  pixel *dst = (pixel *) _dst;
641  pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
642 
643  stride /= sizeof(pixel);
644  AV_WN4PA(dst + stride * 0, val);
645  AV_WN4PA(dst + stride * 1, val);
646  AV_WN4PA(dst + stride * 2, val);
647  AV_WN4PA(dst + stride * 3, val);}
648 
649 static void dc_127_8x8_c(uint8_t *_dst, ptrdiff_t stride,
650  const uint8_t *left, const uint8_t *top)
651 {
652  pixel *dst = (pixel *) _dst;
653  pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
654  int y;
655 
656  stride /= sizeof(pixel);
657  for (y = 0; y < 8; y++) {
658  AV_WN4PA(dst + 0, val);
659  AV_WN4PA(dst + 4, val);
660  dst += stride;
661  }
662 }
663 
664 static void dc_127_16x16_c(uint8_t *_dst, ptrdiff_t stride,
665  const uint8_t *left, const uint8_t *top)
666 {
667  pixel *dst = (pixel *) _dst;
668  pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
669  int y;
670 
671  stride /= sizeof(pixel);
672  for (y = 0; y < 16; y++) {
673  AV_WN4PA(dst + 0, val);
674  AV_WN4PA(dst + 4, val);
675  AV_WN4PA(dst + 8, val);
676  AV_WN4PA(dst + 12, val);
677  dst += stride;
678  }
679 }
680 
681 static void dc_127_32x32_c(uint8_t *_dst, ptrdiff_t stride,
682  const uint8_t *left, const uint8_t *top)
683 {
684  pixel *dst = (pixel *) _dst;
685  pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
686  int y;
687 
688  stride /= sizeof(pixel);
689  for (y = 0; y < 32; y++) {
690  AV_WN4PA(dst + 0, val);
691  AV_WN4PA(dst + 4, val);
692  AV_WN4PA(dst + 8, val);
693  AV_WN4PA(dst + 12, val);
694  AV_WN4PA(dst + 16, val);
695  AV_WN4PA(dst + 20, val);
696  AV_WN4PA(dst + 24, val);
697  AV_WN4PA(dst + 28, val);
698  dst += stride;
699  }
700 }
701 
702 static void dc_129_4x4_c(uint8_t *_dst, ptrdiff_t stride,
703  const uint8_t *left, const uint8_t *top)
704 {
705  pixel *dst = (pixel *) _dst;
706  pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
707 
708  stride /= sizeof(pixel);
709  AV_WN4PA(dst + stride * 0, val);
710  AV_WN4PA(dst + stride * 1, val);
711  AV_WN4PA(dst + stride * 2, val);
712  AV_WN4PA(dst + stride * 3, val);
713 }
714 
715 static void dc_129_8x8_c(uint8_t *_dst, ptrdiff_t stride,
716  const uint8_t *left, const uint8_t *top)
717 {
718  pixel *dst = (pixel *) _dst;
719  pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
720  int y;
721 
722  stride /= sizeof(pixel);
723  for (y = 0; y < 8; y++) {
724  AV_WN4PA(dst + 0, val);
725  AV_WN4PA(dst + 4, val);
726  dst += stride;
727  }
728 }
729 
730 static void dc_129_16x16_c(uint8_t *_dst, ptrdiff_t stride,
731  const uint8_t *left, const uint8_t *top)
732 {
733  pixel *dst = (pixel *) _dst;
734  pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
735  int y;
736 
737  stride /= sizeof(pixel);
738  for (y = 0; y < 16; y++) {
739  AV_WN4PA(dst + 0, val);
740  AV_WN4PA(dst + 4, val);
741  AV_WN4PA(dst + 8, val);
742  AV_WN4PA(dst + 12, val);
743  dst += stride;
744  }
745 }
746 
747 static void dc_129_32x32_c(uint8_t *_dst, ptrdiff_t stride,
748  const uint8_t *left, const uint8_t *top)
749 {
750  pixel *dst = (pixel *) _dst;
751  pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
752  int y;
753 
754  stride /= sizeof(pixel);
755  for (y = 0; y < 32; y++) {
756  AV_WN4PA(dst + 0, val);
757  AV_WN4PA(dst + 4, val);
758  AV_WN4PA(dst + 8, val);
759  AV_WN4PA(dst + 12, val);
760  AV_WN4PA(dst + 16, val);
761  AV_WN4PA(dst + 20, val);
762  AV_WN4PA(dst + 24, val);
763  AV_WN4PA(dst + 28, val);
764  dst += stride;
765  }
766 }
767 
768 #if BIT_DEPTH != 12
769 
#if BIT_DEPTH == 8
/* At 8 bits per component memset_bpc is simply an alias for memset. */
#define memset_bpc memset
#else
/* Store val into each of the first len 16-bit elements of dst. */
static inline void memset_bpc(uint16_t *dst, int val, int len) {
    int remaining = len;

    while (remaining-- > 0)
        *dst++ = val;
}
#endif
780 
/*
 * Address the element at column x, row y of the local array `dst`,
 * using the local `stride` as the distance between rows.
 */
#define DST(x, y) (dst[(y) * stride + (x)])
782 
783 static void diag_downleft_4x4_c(uint8_t *_dst, ptrdiff_t stride,
784  const uint8_t *left, const uint8_t *_top)
785 {
786  pixel *dst = (pixel *) _dst;
787  const pixel *top = (const pixel *) _top;
788  int a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
789  a4 = top[4], a5 = top[5], a6 = top[6], a7 = top[7];
790 
791  stride /= sizeof(pixel);
792  DST(0,0) = (a0 + a1 * 2 + a2 + 2) >> 2;
793  DST(1,0) = DST(0,1) = (a1 + a2 * 2 + a3 + 2) >> 2;
794  DST(2,0) = DST(1,1) = DST(0,2) = (a2 + a3 * 2 + a4 + 2) >> 2;
795  DST(3,0) = DST(2,1) = DST(1,2) = DST(0,3) = (a3 + a4 * 2 + a5 + 2) >> 2;
796  DST(3,1) = DST(2,2) = DST(1,3) = (a4 + a5 * 2 + a6 + 2) >> 2;
797  DST(3,2) = DST(2,3) = (a5 + a6 * 2 + a7 + 2) >> 2;
798  DST(3,3) = a7; // note: this is different from vp8 and such
799 }
800 
801 #define def_diag_downleft(size) \
802 static void diag_downleft_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
803  const uint8_t *left, const uint8_t *_top) \
804 { \
805  pixel *dst = (pixel *) _dst; \
806  const pixel *top = (const pixel *) _top; \
807  int i, j; \
808  pixel v[size - 1]; \
809 \
810  stride /= sizeof(pixel); \
811  for (i = 0; i < size - 2; i++) \
812  v[i] = (top[i] + top[i + 1] * 2 + top[i + 2] + 2) >> 2; \
813  v[size - 2] = (top[size - 2] + top[size - 1] * 3 + 2) >> 2; \
814 \
815  for (j = 0; j < size; j++) { \
816  memcpy(dst + j*stride, v + j, (size - 1 - j) * sizeof(pixel)); \
817  memset_bpc(dst + j*stride + size - 1 - j, top[size - 1], j + 1); \
818  } \
819 }
820 
824 
825 static void diag_downright_4x4_c(uint8_t *_dst, ptrdiff_t stride,
826  const uint8_t *_left, const uint8_t *_top)
827 {
828  pixel *dst = (pixel *) _dst;
829  const pixel *top = (const pixel *) _top;
830  const pixel *left = (const pixel *) _left;
831  int tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
832  l0 = left[3], l1 = left[2], l2 = left[1], l3 = left[0];
833 
834  stride /= sizeof(pixel);
835  DST(0,3) = (l1 + l2 * 2 + l3 + 2) >> 2;
836  DST(0,2) = DST(1,3) = (l0 + l1 * 2 + l2 + 2) >> 2;
837  DST(0,1) = DST(1,2) = DST(2,3) = (tl + l0 * 2 + l1 + 2) >> 2;
838  DST(0,0) = DST(1,1) = DST(2,2) = DST(3,3) = (l0 + tl * 2 + a0 + 2) >> 2;
839  DST(1,0) = DST(2,1) = DST(3,2) = (tl + a0 * 2 + a1 + 2) >> 2;
840  DST(2,0) = DST(3,1) = (a0 + a1 * 2 + a2 + 2) >> 2;
841  DST(3,0) = (a1 + a2 * 2 + a3 + 2) >> 2;
842 }
843 
844 #define def_diag_downright(size) \
845 static void diag_downright_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
846  const uint8_t *_left, const uint8_t *_top) \
847 { \
848  pixel *dst = (pixel *) _dst; \
849  const pixel *top = (const pixel *) _top; \
850  const pixel *left = (const pixel *) _left; \
851  int i, j; \
852  pixel v[size + size - 1]; \
853 \
854  stride /= sizeof(pixel); \
855  for (i = 0; i < size - 2; i++) { \
856  v[i ] = (left[i] + left[i + 1] * 2 + left[i + 2] + 2) >> 2; \
857  v[size + 1 + i] = (top[i] + top[i + 1] * 2 + top[i + 2] + 2) >> 2; \
858  } \
859  v[size - 2] = (left[size - 2] + left[size - 1] * 2 + top[-1] + 2) >> 2; \
860  v[size - 1] = (left[size - 1] + top[-1] * 2 + top[ 0] + 2) >> 2; \
861  v[size ] = (top[-1] + top[0] * 2 + top[ 1] + 2) >> 2; \
862 \
863  for (j = 0; j < size; j++) \
864  memcpy(dst + j*stride, v + size - 1 - j, size * sizeof(pixel)); \
865 }
866 
870 
871 static void vert_right_4x4_c(uint8_t *_dst, ptrdiff_t stride,
872  const uint8_t *_left, const uint8_t *_top)
873 {
874  pixel *dst = (pixel *) _dst;
875  const pixel *top = (const pixel *) _top;
876  const pixel *left = (const pixel *) _left;
877  int tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
878  l0 = left[3], l1 = left[2], l2 = left[1];
879 
880  stride /= sizeof(pixel);
881  DST(0,3) = (l0 + l1 * 2 + l2 + 2) >> 2;
882  DST(0,2) = (tl + l0 * 2 + l1 + 2) >> 2;
883  DST(0,0) = DST(1,2) = (tl + a0 + 1) >> 1;
884  DST(0,1) = DST(1,3) = (l0 + tl * 2 + a0 + 2) >> 2;
885  DST(1,0) = DST(2,2) = (a0 + a1 + 1) >> 1;
886  DST(1,1) = DST(2,3) = (tl + a0 * 2 + a1 + 2) >> 2;
887  DST(2,0) = DST(3,2) = (a1 + a2 + 1) >> 1;
888  DST(2,1) = DST(3,3) = (a0 + a1 * 2 + a2 + 2) >> 2;
889  DST(3,0) = (a2 + a3 + 1) >> 1;
890  DST(3,1) = (a1 + a2 * 2 + a3 + 2) >> 2;
891 }
892 
/*
 * Generate vert_right_<size>x<size>_c().
 *
 * Two tables of size + size/2 - 1 entries are built: even[] feeds the
 * even output rows and odd[] feeds the odd ones. The first size/2 - 1
 * entries of each come from 3-tap averages over the left column (and
 * top[-1]); the rest come from the top row (2-tap for even[], 3-tap for
 * odd[]). Output rows 2*r and 2*r + 1 are the size-pixel windows that
 * start at index size/2 - 1 - r of even[] and odd[] respectively.
 */
#define def_vert_right(size) \
static void vert_right_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
                                           const uint8_t *_left, const uint8_t *_top) \
{ \
    pixel *dst = (pixel *) _dst; \
    const pixel *top = (const pixel *) _top; \
    const pixel *left = (const pixel *) _left; \
    const int half = size/2; \
    pixel even[size + size/2 - 1], odd[size + size/2 - 1]; \
    int k, r; \
\
    stride /= sizeof(pixel); \
    for (k = 0; k < half - 2; k++) { \
        const pixel *l = left + k*2; \
\
        odd[k]  = (l[3] + l[2] * 2 + l[1] + 2) >> 2; \
        even[k] = (l[4] + l[3] * 2 + l[2] + 2) >> 2; \
    } \
    odd[half - 2]  = (left[size - 1] + left[size - 2] * 2 + left[size - 3] + 2) >> 2; \
    even[half - 2] = (top[-1] + left[size - 1] * 2 + left[size - 2] + 2) >> 2; \
\
    even[half - 1] = (top[-1] + top[0] + 1) >> 1; \
    odd[half - 1]  = (left[size - 1] + top[-1] * 2 + top[0] + 2) >> 2; \
    for (k = 0; k < size - 1; k++) { \
        even[half + k] = (top[k] + top[k + 1] + 1) >> 1; \
        odd[half + k]  = (top[k - 1] + top[k] * 2 + top[k + 1] + 2) >> 2; \
    } \
\
    for (r = 0; r < half; r++) { \
        pixel *out = dst + r*2 * stride; \
\
        memcpy(out,          even + half - 1 - r, size * sizeof(pixel)); \
        memcpy(out + stride, odd  + half - 1 - r, size * sizeof(pixel)); \
    } \
}
923 
925 def_vert_right(16)
926 def_vert_right(32)
927 
928 static void hor_down_4x4_c(uint8_t *_dst, ptrdiff_t stride,
929  const uint8_t *_left, const uint8_t *_top)
930 {
931  pixel *dst = (pixel *) _dst;
932  const pixel *top = (const pixel *) _top;
933  const pixel *left = (const pixel *) _left;
934  int l0 = left[3], l1 = left[2], l2 = left[1], l3 = left[0],
935  tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2];
936 
937  stride /= sizeof(pixel);
938  DST(2,0) = (tl + a0 * 2 + a1 + 2) >> 2;
939  DST(3,0) = (a0 + a1 * 2 + a2 + 2) >> 2;
940  DST(0,0) = DST(2,1) = (tl + l0 + 1) >> 1;
941  DST(1,0) = DST(3,1) = (a0 + tl * 2 + l0 + 2) >> 2;
942  DST(0,1) = DST(2,2) = (l0 + l1 + 1) >> 1;
943  DST(1,1) = DST(3,2) = (tl + l0 * 2 + l1 + 2) >> 2;
944  DST(0,2) = DST(2,3) = (l1 + l2 + 1) >> 1;
945  DST(1,2) = DST(3,3) = (l0 + l1 * 2 + l2 + 2) >> 2;
946  DST(0,3) = (l2 + l3 + 1) >> 1;
947  DST(1,3) = (l1 + l2 * 2 + l3 + 2) >> 2;
948 }
949 
/*
 * Generate hor_down_<size>x<size>_c().
 *
 * edge[] has 3 * size - 2 entries. The first 2 * size entries alternate
 * 2-tap and 3-tap rounded averages along the left column up to top[-1];
 * the remaining size - 2 entries are 3-tap averages over the top row.
 * Row r of the output is the size-pixel window of edge[] that starts at
 * index 2 * size - 2 - 2 * r.
 */
#define def_hor_down(size) \
static void hor_down_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
                                         const uint8_t *_left, const uint8_t *_top) \
{ \
    pixel *dst = (pixel *) _dst; \
    const pixel *top = (const pixel *) _top; \
    const pixel *left = (const pixel *) _left; \
    pixel edge[size * 3 - 2]; \
    int k, r; \
\
    stride /= sizeof(pixel); \
    for (k = 0; k < size - 2; k++) { \
        edge[k*2    ] = (left[k + 1] + left[k + 0] + 1) >> 1; \
        edge[k*2 + 1] = (left[k + 2] + left[k + 1] * 2 + left[k + 0] + 2) >> 2; \
    } \
    edge[size*2 - 4] = (left[size - 1] + left[size - 2] + 1) >> 1; \
    edge[size*2 - 3] = (top[-1] + left[size - 1] * 2 + left[size - 2] + 2) >> 2; \
    edge[size*2 - 2] = (top[-1] + left[size - 1] + 1) >> 1; \
    edge[size*2 - 1] = (top[0] + top[-1] * 2 + left[size - 1] + 2) >> 2; \
    for (k = 0; k < size - 2; k++) \
        edge[size*2 + k] = (top[k - 1] + top[k] * 2 + top[k + 1] + 2) >> 2; \
\
    for (r = 0; r < size; r++) \
        memcpy(dst + r*stride, edge + size*2 - 2 - r*2, size * sizeof(pixel)); \
}
974 
976 def_hor_down(16)
977 def_hor_down(32)
978 
979 static void vert_left_4x4_c(uint8_t *_dst, ptrdiff_t stride,
980  const uint8_t *left, const uint8_t *_top)
981 {
982  pixel *dst = (pixel *) _dst;
983  const pixel *top = (const pixel *) _top;
984  int a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
985  a4 = top[4], a5 = top[5], a6 = top[6];
986 
987  stride /= sizeof(pixel);
988  DST(0,0) = (a0 + a1 + 1) >> 1;
989  DST(0,1) = (a0 + a1 * 2 + a2 + 2) >> 2;
990  DST(1,0) = DST(0,2) = (a1 + a2 + 1) >> 1;
991  DST(1,1) = DST(0,3) = (a1 + a2 * 2 + a3 + 2) >> 2;
992  DST(2,0) = DST(1,2) = (a2 + a3 + 1) >> 1;
993  DST(2,1) = DST(1,3) = (a2 + a3 * 2 + a4 + 2) >> 2;
994  DST(3,0) = DST(2,2) = (a3 + a4 + 1) >> 1;
995  DST(3,1) = DST(2,3) = (a3 + a4 * 2 + a5 + 2) >> 2;
996  DST(3,2) = (a4 + a5 + 1) >> 1;
997  DST(3,3) = (a4 + a5 * 2 + a6 + 2) >> 2;
998 }
999 
/*
 * Generate vert_left_<size>x<size>_c().
 *
 * avg2[] / avg3[] hold the 2-tap and 3-tap averages of the top edge (the
 * last 3-tap entry weights top[size - 1] three times).  Row pair p copies
 * size - p - 1 entries starting at index p into the even (avg2) and odd
 * (avg3) row and fills the remaining p + 1 pixels with top[size - 1].
 * `left` is not used.
 */
#define def_vert_left(size) \
static void vert_left_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
                                          const uint8_t *left, const uint8_t *_top) \
{ \
    const pixel *top = (const pixel *) _top; \
    pixel *dst = (pixel *) _dst; \
    pixel avg2[size - 1], avg3[size - 1]; \
    int k, pair; \
\
    stride /= sizeof(pixel); \
    for (k = 0; k < size - 2; k++) { \
        avg2[k] = (top[k] + top[k + 1] + 1) >> 1; \
        avg3[k] = (top[k] + top[k + 1] * 2 + top[k + 2] + 2) >> 2; \
    } \
    avg2[size - 2] = (top[size - 2] + top[size - 1] + 1) >> 1; \
    avg3[size - 2] = (top[size - 2] + top[size - 1] * 3 + 2) >> 2; \
\
    for (pair = 0; pair < size / 2; pair++) { \
        pixel *even_row = dst + pair*2 * stride; \
        pixel *odd_row  = even_row + stride; \
        const int ncopy = size - pair - 1; \
        const int nfill = pair + 1; \
\
        memcpy(even_row, avg2 + pair, ncopy * sizeof(pixel)); \
        memset_bpc(even_row + ncopy, top[size - 1], nfill); \
        memcpy(odd_row, avg3 + pair, ncopy * sizeof(pixel)); \
        memset_bpc(odd_row + ncopy, top[size - 1], nfill); \
    } \
}
1024 
1026 def_vert_left(16)
1027 def_vert_left(32)
1028 
1029 static void hor_up_4x4_c(uint8_t *_dst, ptrdiff_t stride,
1030  const uint8_t *_left, const uint8_t *top)
1031 {
1032  pixel *dst = (pixel *) _dst;
1033  const pixel *left = (const pixel *) _left;
1034  int l0 = left[0], l1 = left[1], l2 = left[2], l3 = left[3];
1035 
1036  stride /= sizeof(pixel);
1037  DST(0,0) = (l0 + l1 + 1) >> 1;
1038  DST(1,0) = (l0 + l1 * 2 + l2 + 2) >> 2;
1039  DST(0,1) = DST(2,0) = (l1 + l2 + 1) >> 1;
1040  DST(1,1) = DST(3,0) = (l1 + l2 * 2 + l3 + 2) >> 2;
1041  DST(0,2) = DST(2,1) = (l2 + l3 + 1) >> 1;
1042  DST(1,2) = DST(3,1) = (l2 + l3 * 3 + 2) >> 2;
1043  DST(0,3) = DST(1,3) = DST(2,2) = DST(2,3) = DST(3,2) = DST(3,3) = l3;
1044 }
1045 
/*
 * Generate hor_up_<size>x<size>_c().
 *
 * v[] holds size * 2 - 2 interleaved 2-tap / 3-tap averages of the left
 * edge (the last 3-tap entry weights left[size - 1] three times).  Row r
 * starts at v[2 * r]; once fewer than `size` entries remain, the rest of
 * the row is filled with left[size - 1].  `top` is not used.
 */
#define def_hor_up(size) \
static void hor_up_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
                                       const uint8_t *_left, const uint8_t *top) \
{ \
    const pixel *left = (const pixel *) _left; \
    pixel *dst = (pixel *) _dst; \
    pixel v[size*2 - 2]; \
    int k, row; \
\
    stride /= sizeof(pixel); \
    for (k = 0; k < size - 2; k++) { \
        v[2*k    ] = (left[k] + left[k + 1] + 1) >> 1; \
        v[2*k + 1] = (left[k] + left[k + 1] * 2 + left[k + 2] + 2) >> 2; \
    } \
    v[size*2 - 4] = (left[size - 2] + left[size - 1] + 1) >> 1; \
    v[size*2 - 3] = (left[size - 2] + left[size - 1] * 3 + 2) >> 2; \
\
    for (row = 0; row < size; row++) { \
        pixel *line = dst + row*stride; \
        /* number of strip entries available from v[2 * row] onwards */ \
        const int avail = size*2 - 2 - row*2; \
\
        if (avail >= size) { \
            memcpy(line, v + row*2, size * sizeof(pixel)); \
        } else { \
            memcpy(line, v + row*2, avail * sizeof(pixel)); \
            memset_bpc(line + avail, left[size - 1], size - avail); \
        } \
    } \
}
1071 
1073 def_hor_up(16)
1074 def_hor_up(32)
1075 
1076 #undef DST
1077 
1078 #endif /* BIT_DEPTH != 12 */
1079 
/*
 * Fill dsp->intra_pred[tx][mode] with the C intra predictors of this
 * bit-depth instantiation, for TX_4X4, TX_8X8, TX_16X16 and TX_32X32.
 *
 * - The function is static unless BIT_DEPTH == 10.
 * - For every BIT_DEPTH other than 8 the 10-bit initializer is declared;
 *   it is only called when BIT_DEPTH == 12.
 * - When BIT_DEPTH == 12, ff_vp9dsp_intrapred_init_10() runs first and
 *   only the "bd_aware" entries (TM_VP8_PRED, DC_128_PRED, DC_127_PRED,
 *   DC_129_PRED) are then overwritten with the functions compiled here.
 * - Otherwise every mode listed in init_intra_pred() is assigned.
 */
1080 #if BIT_DEPTH != 8
1081 void ff_vp9dsp_intrapred_init_10(VP9DSPContext *dsp);
1082 #endif
1083 #if BIT_DEPTH != 10
1084 static
1085 #endif
1086 av_cold void FUNC(ff_vp9dsp_intrapred_init)(VP9DSPContext *dsp)
1087 {
/* entries that are set for every bit depth, including 12 */
1088 #define init_intra_pred_bd_aware(tx, sz) \
1089  dsp->intra_pred[tx][TM_VP8_PRED] = tm_##sz##_c; \
1090  dsp->intra_pred[tx][DC_128_PRED] = dc_128_##sz##_c; \
1091  dsp->intra_pred[tx][DC_127_PRED] = dc_127_##sz##_c; \
1092  dsp->intra_pred[tx][DC_129_PRED] = dc_129_##sz##_c
1093 
1094 #if BIT_DEPTH == 12
1095  ff_vp9dsp_intrapred_init_10(dsp);
1096 #define init_intra_pred(tx, sz) \
1097  init_intra_pred_bd_aware(tx, sz)
1098 #else
1099  #define init_intra_pred(tx, sz) \
1100  dsp->intra_pred[tx][VERT_PRED] = vert_##sz##_c; \
1101  dsp->intra_pred[tx][HOR_PRED] = hor_##sz##_c; \
1102  dsp->intra_pred[tx][DC_PRED] = dc_##sz##_c; \
1103  dsp->intra_pred[tx][DIAG_DOWN_LEFT_PRED] = diag_downleft_##sz##_c; \
1104  dsp->intra_pred[tx][DIAG_DOWN_RIGHT_PRED] = diag_downright_##sz##_c; \
1105  dsp->intra_pred[tx][VERT_RIGHT_PRED] = vert_right_##sz##_c; \
1106  dsp->intra_pred[tx][HOR_DOWN_PRED] = hor_down_##sz##_c; \
1107  dsp->intra_pred[tx][VERT_LEFT_PRED] = vert_left_##sz##_c; \
1108  dsp->intra_pred[tx][HOR_UP_PRED] = hor_up_##sz##_c; \
1109  dsp->intra_pred[tx][LEFT_DC_PRED] = dc_left_##sz##_c; \
1110  dsp->intra_pred[tx][TOP_DC_PRED] = dc_top_##sz##_c; \
1111  init_intra_pred_bd_aware(tx, sz)
1112 #endif
1113 
1114  init_intra_pred(TX_4X4, 4x4);
1115  init_intra_pred(TX_8X8, 8x8);
1116  init_intra_pred(TX_16X16, 16x16);
1117  init_intra_pred(TX_32X32, 32x32);
1118 
1119 #undef init_intra_pred
1120 #undef init_intra_pred_bd_aware
1121 }
1122 
/*
 * Generate <type_a>_<type_b>_<sz>x<sz>_add_c(): a two-pass inverse
 * transform whose result is added to the destination pixels.
 *
 * type_a      1-D transform used for the first pass  (pass argument 0)
 * type_b      1-D transform used for the second pass (pass argument 1)
 * sz          block size
 * bits        final rounding right shift; 0 means no shift
 * has_dconly  when non-zero and eob == 1, only block[0] is transformed and
 *             the same value is added to every pixel
 *
 * The coefficients that were consumed are zeroed (block[0] in the DC-only
 * path, the whole sz * sz block otherwise).
 */
#define itxfm_wrapper(type_a, type_b, sz, bits, has_dconly) \
static void type_a##_##type_b##_##sz##x##sz##_add_c(uint8_t *_dst, \
                                                    ptrdiff_t stride, \
                                                    int16_t *_block, int eob) \
{ \
    pixel *dst = (pixel *) _dst; \
    dctcoef *block = (dctcoef *) _block, tmp[sz * sz], out[sz]; \
    int col, row; \
\
    stride /= sizeof(pixel); \
    if (has_dconly && eob == 1) { \
        const int t = ((((dctint) block[0] * 11585 + (1 << 13)) >> 14) \
                                           * 11585 + (1 << 13)) >> 14; \
        /* identical for every pixel, so evaluate it once */ \
        const int delta = bits ? (int)(t + (1U << (bits - 1))) >> bits : t; \
\
        block[0] = 0; \
        for (col = 0; col < sz; col++, dst++) \
            for (row = 0; row < sz; row++) \
                dst[row * stride] = av_clip_pixel(dst[row * stride] + delta); \
        return; \
    } \
\
    /* first pass: block -> tmp */ \
    for (col = 0; col < sz; col++) \
        type_a##sz##_1d(block + col, sz, tmp + col * sz, 0); \
    memset(block, 0, sz * sz * sizeof(*block)); \
\
    /* second pass: tmp -> out, one destination column at a time */ \
    for (col = 0; col < sz; col++, dst++) { \
        type_b##sz##_1d(tmp + col, sz, out, 1); \
        for (row = 0; row < sz; row++) \
            dst[row * stride] = av_clip_pixel(dst[row * stride] + \
                                              (bits ? \
                                               (int)(out[row] + (1U << (bits - 1))) >> bits : \
                                               out[row])); \
    } \
}
1161 
/*
 * Instantiate itxfm_wrapper() for all four idct/iadst combinations of one
 * block size.  Only the idct/idct variant enables the DC-only shortcut
 * (has_dconly = 1).
 */
1162 #define itxfm_wrap(sz, bits) \
1163 itxfm_wrapper(idct, idct, sz, bits, 1) \
1164 itxfm_wrapper(iadst, idct, sz, bits, 0) \
1165 itxfm_wrapper(idct, iadst, sz, bits, 0) \
1166 itxfm_wrapper(iadst, iadst, sz, bits, 0)
1167 
/* Strided read of input coefficient x, widened to dctint; expects `in` and
 * `stride` to be in scope at the point of use. */
1168 #define IN(x) ((dctint) in[(x) * stride])
1169 
1170 static av_always_inline void idct4_1d(const dctcoef *in, ptrdiff_t stride,
1171  dctcoef *out, int pass)
1172 {
1173  dctint t0, t1, t2, t3;
1174 
1175  t0 = ((IN(0) + IN(2)) * 11585 + (1 << 13)) >> 14;
1176  t1 = ((IN(0) - IN(2)) * 11585 + (1 << 13)) >> 14;
1177  t2 = (IN(1) * 6270 - IN(3) * 15137 + (1 << 13)) >> 14;
1178  t3 = (IN(1) * 15137 + IN(3) * 6270 + (1 << 13)) >> 14;
1179 
1180  out[0] = t0 + t3;
1181  out[1] = t1 + t2;
1182  out[2] = t1 - t2;
1183  out[3] = t0 - t3;
1184 }
1185 
1186 static av_always_inline void iadst4_1d(const dctcoef *in, ptrdiff_t stride,
1187  dctcoef *out, int pass)
1188 {
1189  dctint t0, t1, t2, t3;
1190 
1191  t0 = 5283 * IN(0) + 15212 * IN(2) + 9929 * IN(3);
1192  t1 = 9929 * IN(0) - 5283 * IN(2) - 15212 * IN(3);
1193  t2 = 13377 * (IN(0) - IN(2) + IN(3));
1194  t3 = 13377 * IN(1);
1195 
1196  out[0] = (t0 + t3 + (1 << 13)) >> 14;
1197  out[1] = (t1 + t3 + (1 << 13)) >> 14;
1198  out[2] = (t2 + (1 << 13)) >> 14;
1199  out[3] = (t0 + t1 - t3 + (1 << 13)) >> 14;
1200 }
1201 
1203 
1204 static av_always_inline void idct8_1d(const dctcoef *in, ptrdiff_t stride,
1205  dctcoef *out, int pass)
1206 {
1207  dctint t0, t0a, t1, t1a, t2, t2a, t3, t3a, t4, t4a, t5, t5a, t6, t6a, t7, t7a;
1208 
1209  t0a = ((IN(0) + IN(4)) * 11585 + (1 << 13)) >> 14;
1210  t1a = ((IN(0) - IN(4)) * 11585 + (1 << 13)) >> 14;
1211  t2a = (IN(2) * 6270 - IN(6) * 15137 + (1 << 13)) >> 14;
1212  t3a = (IN(2) * 15137 + IN(6) * 6270 + (1 << 13)) >> 14;
1213  t4a = (IN(1) * 3196 - IN(7) * 16069 + (1 << 13)) >> 14;
1214  t5a = (IN(5) * 13623 - IN(3) * 9102 + (1 << 13)) >> 14;
1215  t6a = (IN(5) * 9102 + IN(3) * 13623 + (1 << 13)) >> 14;
1216  t7a = (IN(1) * 16069 + IN(7) * 3196 + (1 << 13)) >> 14;
1217 
1218  t0 = t0a + t3a;
1219  t1 = t1a + t2a;
1220  t2 = t1a - t2a;
1221  t3 = t0a - t3a;
1222  t4 = t4a + t5a;
1223  t5a = t4a - t5a;
1224  t7 = t7a + t6a;
1225  t6a = t7a - t6a;
1226 
1227  t5 = ((t6a - t5a) * 11585 + (1 << 13)) >> 14;
1228  t6 = ((t6a + t5a) * 11585 + (1 << 13)) >> 14;
1229 
1230  out[0] = t0 + t7;
1231  out[1] = t1 + t6;
1232  out[2] = t2 + t5;
1233  out[3] = t3 + t4;
1234  out[4] = t3 - t4;
1235  out[5] = t2 - t5;
1236  out[6] = t1 - t6;
1237  out[7] = t0 - t7;
1238 }
1239 
1240 static av_always_inline void iadst8_1d(const dctcoef *in, ptrdiff_t stride,
1241  dctcoef *out, int pass)
1242 {
1243  dctint t0, t0a, t1, t1a, t2, t2a, t3, t3a, t4, t4a, t5, t5a, t6, t6a, t7, t7a;
1244 
1245  t0a = 16305 * IN(7) + 1606 * IN(0);
1246  t1a = 1606 * IN(7) - 16305 * IN(0);
1247  t2a = 14449 * IN(5) + 7723 * IN(2);
1248  t3a = 7723 * IN(5) - 14449 * IN(2);
1249  t4a = 10394 * IN(3) + 12665 * IN(4);
1250  t5a = 12665 * IN(3) - 10394 * IN(4);
1251  t6a = 4756 * IN(1) + 15679 * IN(6);
1252  t7a = 15679 * IN(1) - 4756 * IN(6);
1253 
1254  t0 = (t0a + t4a + (1 << 13)) >> 14;
1255  t1 = (t1a + t5a + (1 << 13)) >> 14;
1256  t2 = (t2a + t6a + (1 << 13)) >> 14;
1257  t3 = (t3a + t7a + (1 << 13)) >> 14;
1258  t4 = (t0a - t4a + (1 << 13)) >> 14;
1259  t5 = (t1a - t5a + (1 << 13)) >> 14;
1260  t6 = (t2a - t6a + (1 << 13)) >> 14;
1261  t7 = (t3a - t7a + (1 << 13)) >> 14;
1262 
1263  t4a = 15137U * t4 + 6270U * t5;
1264  t5a = 6270U * t4 - 15137U * t5;
1265  t6a = 15137U * t7 - 6270U * t6;
1266  t7a = 6270U * t7 + 15137U * t6;
1267 
1268  out[0] = t0 + t2;
1269  out[7] = -(t1 + t3);
1270  t2 = t0 - t2;
1271  t3 = t1 - t3;
1272 
1273  out[1] = -((dctint)((1U << 13) + t4a + t6a) >> 14);
1274  out[6] = (dctint)((1U << 13) + t5a + t7a) >> 14;
1275  t6 = (dctint)((1U << 13) + t4a - t6a) >> 14;
1276  t7 = (dctint)((1U << 13) + t5a - t7a) >> 14;
1277 
1278  out[3] = -((dctint)((t2 + t3) * 11585U + (1 << 13)) >> 14);
1279  out[4] = (dctint)((t2 - t3) * 11585U + (1 << 13)) >> 14;
1280  out[2] = (dctint)((t6 + t7) * 11585U + (1 << 13)) >> 14;
1281  out[5] = -((dctint)((t6 - t7) * 11585U + (1 << 13)) >> 14);
1282 }
1283 
1285 
/*
 * One-dimensional 16-point inverse DCT.
 *
 * Reads 16 coefficients through IN(), i.e. in[0], in[stride], ...,
 * in[15 * stride], and writes the results to out[0..15].  `pass` is not
 * used by this transform.
 *
 * Every product sum is rounded with + (1 << 13) and shifted right by 14.
 * The constants carry a U suffix and the sum is cast back to dctint before
 * the shift.  NOTE(review): presumably this keeps the arithmetic unsigned
 * when dctint is int so that overflow wraps - confirm against the dctint
 * typedef.
 *
 * Many temporaries (t5a, t9a, t10a, ...) are overwritten between stages,
 * so the statement order must be preserved.
 */
1286 static av_always_inline void idct16_1d(const dctcoef *in, ptrdiff_t stride,
1287  dctcoef *out, int pass)
1288 {
1289  dctint t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
1290  dctint t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a;
1291  dctint t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
1292 
/* stage 1: rotations of the input coefficients */
1293  t0a = (dctint)((IN(0) + IN(8)) * 11585U + (1 << 13)) >> 14;
1294  t1a = (dctint)((IN(0) - IN(8)) * 11585U + (1 << 13)) >> 14;
1295  t2a = (dctint)(IN(4) * 6270U - IN(12) * 15137U + (1 << 13)) >> 14;
1296  t3a = (dctint)(IN(4) * 15137U + IN(12) * 6270U + (1 << 13)) >> 14;
1297  t4a = (dctint)(IN(2) * 3196U - IN(14) * 16069U + (1 << 13)) >> 14;
1298  t7a = (dctint)(IN(2) * 16069U + IN(14) * 3196U + (1 << 13)) >> 14;
1299  t5a = (dctint)(IN(10) * 13623U - IN(6) * 9102U + (1 << 13)) >> 14;
1300  t6a = (dctint)(IN(10) * 9102U + IN(6) * 13623U + (1 << 13)) >> 14;
1301  t8a = (dctint)(IN(1) * 1606U - IN(15) * 16305U + (1 << 13)) >> 14;
1302  t15a = (dctint)(IN(1) * 16305U + IN(15) * 1606U + (1 << 13)) >> 14;
1303  t9a = (dctint)(IN(9) * 12665U - IN(7) * 10394U + (1 << 13)) >> 14;
1304  t14a = (dctint)(IN(9) * 10394U + IN(7) * 12665U + (1 << 13)) >> 14;
1305  t10a = (dctint)(IN(5) * 7723U - IN(11) * 14449U + (1 << 13)) >> 14;
1306  t13a = (dctint)(IN(5) * 14449U + IN(11) * 7723U + (1 << 13)) >> 14;
1307  t11a = (dctint)(IN(13) * 15679U - IN(3) * 4756U + (1 << 13)) >> 14;
1308  t12a = (dctint)(IN(13) * 4756U + IN(3) * 15679U + (1 << 13)) >> 14;
1309 
/* stage 2: butterflies */
1310  t0 = t0a + t3a;
1311  t1 = t1a + t2a;
1312  t2 = t1a - t2a;
1313  t3 = t0a - t3a;
1314  t4 = t4a + t5a;
1315  t5 = t4a - t5a;
1316  t6 = t7a - t6a;
1317  t7 = t7a + t6a;
1318  t8 = t8a + t9a;
1319  t9 = t8a - t9a;
1320  t10 = t11a - t10a;
1321  t11 = t11a + t10a;
1322  t12 = t12a + t13a;
1323  t13 = t12a - t13a;
1324  t14 = t15a - t14a;
1325  t15 = t15a + t14a;
1326 
/* stage 3: second set of rotations */
1327  t5a = (dctint)((t6 - t5) * 11585U + (1 << 13)) >> 14;
1328  t6a = (dctint)((t6 + t5) * 11585U + (1 << 13)) >> 14;
1329  t9a = (dctint)( t14 * 6270U - t9 * 15137U + (1 << 13)) >> 14;
1330  t14a = (dctint)( t14 * 15137U + t9 * 6270U + (1 << 13)) >> 14;
1331  t10a = (dctint)(-(t13 * 15137U + t10 * 6270U) + (1 << 13)) >> 14;
1332  t13a = (dctint)( t13 * 6270U - t10 * 15137U + (1 << 13)) >> 14;
1333 
/* stage 4: butterflies */
1334  t0a = t0 + t7;
1335  t1a = t1 + t6a;
1336  t2a = t2 + t5a;
1337  t3a = t3 + t4;
1338  t4 = t3 - t4;
1339  t5 = t2 - t5a;
1340  t6 = t1 - t6a;
1341  t7 = t0 - t7;
1342  t8a = t8 + t11;
1343  t9 = t9a + t10a;
1344  t10 = t9a - t10a;
1345  t11a = t8 - t11;
1346  t12a = t15 - t12;
1347  t13 = t14a - t13a;
1348  t14 = t14a + t13a;
1349  t15a = t15 + t12;
1350 
/* stage 5: last rotations */
1351  t10a = (dctint)((t13 - t10) * 11585U + (1 << 13)) >> 14;
1352  t13a = (dctint)((t13 + t10) * 11585U + (1 << 13)) >> 14;
1353  t11 = (dctint)((t12a - t11a) * 11585U + (1 << 13)) >> 14;
1354  t12 = (dctint)((t12a + t11a) * 11585U + (1 << 13)) >> 14;
1355 
/* output butterflies: out[k] and out[15 - k] share the same two terms */
1356  out[ 0] = t0a + t15a;
1357  out[ 1] = t1a + t14;
1358  out[ 2] = t2a + t13a;
1359  out[ 3] = t3a + t12;
1360  out[ 4] = t4 + t11;
1361  out[ 5] = t5 + t10a;
1362  out[ 6] = t6 + t9;
1363  out[ 7] = t7 + t8a;
1364  out[ 8] = t7 - t8a;
1365  out[ 9] = t6 - t9;
1366  out[10] = t5 - t10a;
1367  out[11] = t4 - t11;
1368  out[12] = t3a - t12;
1369  out[13] = t2a - t13a;
1370  out[14] = t1a - t14;
1371  out[15] = t0a - t15a;
1372 }
1373 
/*
 * One-dimensional 16-point inverse ADST.
 *
 * Reads 16 coefficients through IN(), i.e. in[0], in[stride], ...,
 * in[15 * stride], and writes the results to out[0..15].  `pass` is not
 * used by this transform.
 *
 * Products are accumulated at full precision; each stage is rounded by
 * adding (1 << 13) and shifting right by 14.  Constants carry a U suffix
 * and sums are cast back to dctint before the shift.  NOTE(review):
 * presumably this keeps the arithmetic unsigned when dctint is int so that
 * overflow wraps - confirm against the dctint typedef.
 *
 * Temporaries are overwritten between stages, so the statement order must
 * be preserved.
 */
1374 static av_always_inline void iadst16_1d(const dctcoef *in, ptrdiff_t stride,
1375  dctcoef *out, int pass)
1376 {
1377  dctint t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
1378  dctint t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a;
1379  dctint t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
1380 
/* stage 1: rotations of the input coefficients (unrounded products) */
1381  t0 = IN(15) * 16364U + IN(0) * 804U;
1382  t1 = IN(15) * 804U - IN(0) * 16364U;
1383  t2 = IN(13) * 15893U + IN(2) * 3981U;
1384  t3 = IN(13) * 3981U - IN(2) * 15893U;
1385  t4 = IN(11) * 14811U + IN(4) * 7005U;
1386  t5 = IN(11) * 7005U - IN(4) * 14811U;
1387  t6 = IN(9) * 13160U + IN(6) * 9760U;
1388  t7 = IN(9) * 9760U - IN(6) * 13160U;
1389  t8 = IN(7) * 11003U + IN(8) * 12140U;
1390  t9 = IN(7) * 12140U - IN(8) * 11003U;
1391  t10 = IN(5) * 8423U + IN(10) * 14053U;
1392  t11 = IN(5) * 14053U - IN(10) * 8423U;
1393  t12 = IN(3) * 5520U + IN(12) * 15426U;
1394  t13 = IN(3) * 15426U - IN(12) * 5520U;
1395  t14 = IN(1) * 2404U + IN(14) * 16207U;
1396  t15 = IN(1) * 16207U - IN(14) * 2404U;
1397 
/* butterflies with rounding back to coefficient scale */
1398  t0a = (dctint)((1U << 13) + t0 + t8 ) >> 14;
1399  t1a = (dctint)((1U << 13) + t1 + t9 ) >> 14;
1400  t2a = (dctint)((1U << 13) + t2 + t10) >> 14;
1401  t3a = (dctint)((1U << 13) + t3 + t11) >> 14;
1402  t4a = (dctint)((1U << 13) + t4 + t12) >> 14;
1403  t5a = (dctint)((1U << 13) + t5 + t13) >> 14;
1404  t6a = (dctint)((1U << 13) + t6 + t14) >> 14;
1405  t7a = (dctint)((1U << 13) + t7 + t15) >> 14;
1406  t8a = (dctint)((1U << 13) + t0 - t8 ) >> 14;
1407  t9a = (dctint)((1U << 13) + t1 - t9 ) >> 14;
1408  t10a = (dctint)((1U << 13) + t2 - t10) >> 14;
1409  t11a = (dctint)((1U << 13) + t3 - t11) >> 14;
1410  t12a = (dctint)((1U << 13) + t4 - t12) >> 14;
1411  t13a = (dctint)((1U << 13) + t5 - t13) >> 14;
1412  t14a = (dctint)((1U << 13) + t6 - t14) >> 14;
1413  t15a = (dctint)((1U << 13) + t7 - t15) >> 14;
1414 
/* stage 2: rotations of the upper half (unrounded products) */
1415  t8 = t8a * 16069U + t9a * 3196U;
1416  t9 = t8a * 3196U - t9a * 16069U;
1417  t10 = t10a * 9102U + t11a * 13623U;
1418  t11 = t10a * 13623U - t11a * 9102U;
1419  t12 = t13a * 16069U - t12a * 3196U;
1420  t13 = t13a * 3196U + t12a * 16069U;
1421  t14 = t15a * 9102U - t14a * 13623U;
1422  t15 = t15a * 13623U + t14a * 9102U;
1423 
1424  t0 = t0a + t4a;
1425  t1 = t1a + t5a;
1426  t2 = t2a + t6a;
1427  t3 = t3a + t7a;
1428  t4 = t0a - t4a;
1429  t5 = t1a - t5a;
1430  t6 = t2a - t6a;
1431  t7 = t3a - t7a;
1432  t8a = (dctint)((1U << 13) + t8 + t12) >> 14;
1433  t9a = (dctint)((1U << 13) + t9 + t13) >> 14;
1434  t10a = (dctint)((1U << 13) + t10 + t14) >> 14;
1435  t11a = (dctint)((1U << 13) + t11 + t15) >> 14;
1436  t12a = (dctint)((1U << 13) + t8 - t12) >> 14;
1437  t13a = (dctint)((1U << 13) + t9 - t13) >> 14;
1438  t14a = (dctint)((1U << 13) + t10 - t14) >> 14;
1439  t15a = (dctint)((1U << 13) + t11 - t15) >> 14;
1440 
/* stage 3: rotations (unrounded products) */
1441  t4a = t4 * 15137U + t5 * 6270U;
1442  t5a = t4 * 6270U - t5 * 15137U;
1443  t6a = t7 * 15137U - t6 * 6270U;
1444  t7a = t7 * 6270U + t6 * 15137U;
1445  t12 = t12a * 15137U + t13a * 6270U;
1446  t13 = t12a * 6270U - t13a * 15137U;
1447  t14 = t15a * 15137U - t14a * 6270U;
1448  t15 = t15a * 6270U + t14a * 15137U;
1449 
/* first outputs and the inputs of the final stage */
1450  out[ 0] = t0 + t2;
1451  out[15] = -(t1 + t3);
1452  t2a = t0 - t2;
1453  t3a = t1 - t3;
1454  out[ 3] = -((dctint)((1U << 13) + t4a + t6a) >> 14);
1455  out[12] = (dctint)((1U << 13) + t5a + t7a) >> 14;
1456  t6 = (dctint)((1U << 13) + t4a - t6a) >> 14;
1457  t7 = (dctint)((1U << 13) + t5a - t7a) >> 14;
1458  out[ 1] = -(t8a + t10a);
1459  out[14] = t9a + t11a;
1460  t10 = t8a - t10a;
1461  t11 = t9a - t11a;
1462  out[ 2] = (dctint)((1U << 13) + t12 + t14) >> 14;
1463  out[13] = -((dctint)((1U << 13) + t13 + t15) >> 14);
1464  t14a = (dctint)((1U << 13) + t12 - t14) >> 14;
1465  t15a = (dctint)((1U << 13) + t13 - t15) >> 14;
1466 
/* final stage: the remaining eight outputs, each scaled by 11585 / 2^14 */
1467  out[ 7] = (dctint)(-(t2a + t3a) * 11585U + (1 << 13)) >> 14;
1468  out[ 8] = (dctint)( (t2a - t3a) * 11585U + (1 << 13)) >> 14;
1469  out[ 4] = (dctint)( (t7 + t6) * 11585U + (1 << 13)) >> 14;
1470  out[11] = (dctint)( (t7 - t6) * 11585U + (1 << 13)) >> 14;
1471  out[ 6] = (dctint)( (t11 + t10) * 11585U + (1 << 13)) >> 14;
1472  out[ 9] = (dctint)( (t11 - t10) * 11585U + (1 << 13)) >> 14;
1473  out[ 5] = (dctint)(-(t14a + t15a) * 11585U + (1 << 13)) >> 14;
1474  out[10] = (dctint)( (t14a - t15a) * 11585U + (1 << 13)) >> 14;
1475 }
1476 
1478 
/*
 * One-dimensional 32-point inverse DCT.
 *
 * Reads 32 coefficients through IN(), i.e. in[0], in[stride], ...,
 * in[31 * stride], and writes the results to out[0..31].  `pass` is not
 * used by this transform.
 *
 * Every product sum is rounded with + (1 << 13) and shifted right by 14.
 * The constants carry a U suffix and the sum is cast back to dctint before
 * the shift.  NOTE(review): presumably this keeps the arithmetic unsigned
 * when dctint is int so that overflow wraps - confirm against the dctint
 * typedef.
 *
 * The tN / tNa temporaries are overwritten from stage to stage, so the
 * statement order must be preserved.
 */
1479 static av_always_inline void idct32_1d(const dctcoef *in, ptrdiff_t stride,
1480  dctcoef *out, int pass)
1481 {
/* stage 1: rotations of the input coefficients */
1482  dctint t0a = (dctint)((IN(0) + IN(16)) * 11585U + (1 << 13)) >> 14;
1483  dctint t1a = (dctint)((IN(0) - IN(16)) * 11585U + (1 << 13)) >> 14;
1484  dctint t2a = (dctint)(IN( 8) * 6270U - IN(24) * 15137U + (1 << 13)) >> 14;
1485  dctint t3a = (dctint)(IN( 8) * 15137U + IN(24) * 6270U + (1 << 13)) >> 14;
1486  dctint t4a = (dctint)(IN( 4) * 3196U - IN(28) * 16069U + (1 << 13)) >> 14;
1487  dctint t7a = (dctint)(IN( 4) * 16069U + IN(28) * 3196U + (1 << 13)) >> 14;
1488  dctint t5a = (dctint)(IN(20) * 13623U - IN(12) * 9102U + (1 << 13)) >> 14;
1489  dctint t6a = (dctint)(IN(20) * 9102U + IN(12) * 13623U + (1 << 13)) >> 14;
1490  dctint t8a = (dctint)(IN( 2) * 1606U - IN(30) * 16305U + (1 << 13)) >> 14;
1491  dctint t15a = (dctint)(IN( 2) * 16305U + IN(30) * 1606U + (1 << 13)) >> 14;
1492  dctint t9a = (dctint)(IN(18) * 12665U - IN(14) * 10394U + (1 << 13)) >> 14;
1493  dctint t14a = (dctint)(IN(18) * 10394U + IN(14) * 12665U + (1 << 13)) >> 14;
1494  dctint t10a = (dctint)(IN(10) * 7723U - IN(22) * 14449U + (1 << 13)) >> 14;
1495  dctint t13a = (dctint)(IN(10) * 14449U + IN(22) * 7723U + (1 << 13)) >> 14;
1496  dctint t11a = (dctint)(IN(26) * 15679U - IN( 6) * 4756U + (1 << 13)) >> 14;
1497  dctint t12a = (dctint)(IN(26) * 4756U + IN( 6) * 15679U + (1 << 13)) >> 14;
1498  dctint t16a = (dctint)(IN( 1) * 804U - IN(31) * 16364U + (1 << 13)) >> 14;
1499  dctint t31a = (dctint)(IN( 1) * 16364U + IN(31) * 804U + (1 << 13)) >> 14;
1500  dctint t17a = (dctint)(IN(17) * 12140U - IN(15) * 11003U + (1 << 13)) >> 14;
1501  dctint t30a = (dctint)(IN(17) * 11003U + IN(15) * 12140U + (1 << 13)) >> 14;
1502  dctint t18a = (dctint)(IN( 9) * 7005U - IN(23) * 14811U + (1 << 13)) >> 14;
1503  dctint t29a = (dctint)(IN( 9) * 14811U + IN(23) * 7005U + (1 << 13)) >> 14;
1504  dctint t19a = (dctint)(IN(25) * 15426U - IN( 7) * 5520U + (1 << 13)) >> 14;
1505  dctint t28a = (dctint)(IN(25) * 5520U + IN( 7) * 15426U + (1 << 13)) >> 14;
1506  dctint t20a = (dctint)(IN( 5) * 3981U - IN(27) * 15893U + (1 << 13)) >> 14;
1507  dctint t27a = (dctint)(IN( 5) * 15893U + IN(27) * 3981U + (1 << 13)) >> 14;
1508  dctint t21a = (dctint)(IN(21) * 14053U - IN(11) * 8423U + (1 << 13)) >> 14;
1509  dctint t26a = (dctint)(IN(21) * 8423U + IN(11) * 14053U + (1 << 13)) >> 14;
1510  dctint t22a = (dctint)(IN(13) * 9760U - IN(19) * 13160U + (1 << 13)) >> 14;
1511  dctint t25a = (dctint)(IN(13) * 13160U + IN(19) * 9760U + (1 << 13)) >> 14;
1512  dctint t23a = (dctint)(IN(29) * 16207U - IN( 3) * 2404U + (1 << 13)) >> 14;
1513  dctint t24a = (dctint)(IN(29) * 2404U + IN( 3) * 16207U + (1 << 13)) >> 14;
1514 
/* stage 2: butterflies */
1515  dctint t0 = t0a + t3a;
1516  dctint t1 = t1a + t2a;
1517  dctint t2 = t1a - t2a;
1518  dctint t3 = t0a - t3a;
1519  dctint t4 = t4a + t5a;
1520  dctint t5 = t4a - t5a;
1521  dctint t6 = t7a - t6a;
1522  dctint t7 = t7a + t6a;
1523  dctint t8 = t8a + t9a;
1524  dctint t9 = t8a - t9a;
1525  dctint t10 = t11a - t10a;
1526  dctint t11 = t11a + t10a;
1527  dctint t12 = t12a + t13a;
1528  dctint t13 = t12a - t13a;
1529  dctint t14 = t15a - t14a;
1530  dctint t15 = t15a + t14a;
1531  dctint t16 = t16a + t17a;
1532  dctint t17 = t16a - t17a;
1533  dctint t18 = t19a - t18a;
1534  dctint t19 = t19a + t18a;
1535  dctint t20 = t20a + t21a;
1536  dctint t21 = t20a - t21a;
1537  dctint t22 = t23a - t22a;
1538  dctint t23 = t23a + t22a;
1539  dctint t24 = t24a + t25a;
1540  dctint t25 = t24a - t25a;
1541  dctint t26 = t27a - t26a;
1542  dctint t27 = t27a + t26a;
1543  dctint t28 = t28a + t29a;
1544  dctint t29 = t28a - t29a;
1545  dctint t30 = t31a - t30a;
1546  dctint t31 = t31a + t30a;
1547 
/* stage 3: rotations */
1548  t5a = (dctint)((t6 - t5) * 11585U + (1 << 13)) >> 14;
1549  t6a = (dctint)((t6 + t5) * 11585U + (1 << 13)) >> 14;
1550  t9a = (dctint)( t14 * 6270U - t9 * 15137U + (1 << 13)) >> 14;
1551  t14a = (dctint)( t14 * 15137U + t9 * 6270U + (1 << 13)) >> 14;
1552  t10a = (dctint)(-(t13 * 15137U + t10 * 6270U) + (1 << 13)) >> 14;
1553  t13a = (dctint)( t13 * 6270U - t10 * 15137U + (1 << 13)) >> 14;
1554  t17a = (dctint)( t30 * 3196U - t17 * 16069U + (1 << 13)) >> 14;
1555  t30a = (dctint)( t30 * 16069U + t17 * 3196U + (1 << 13)) >> 14;
1556  t18a = (dctint)(-(t29 * 16069U + t18 * 3196U) + (1 << 13)) >> 14;
1557  t29a = (dctint)( t29 * 3196U - t18 * 16069U + (1 << 13)) >> 14;
1558  t21a = (dctint)( t26 * 13623U - t21 * 9102U + (1 << 13)) >> 14;
1559  t26a = (dctint)( t26 * 9102U + t21 * 13623U + (1 << 13)) >> 14;
1560  t22a = (dctint)(-(t25 * 9102U + t22 * 13623U) + (1 << 13)) >> 14;
1561  t25a = (dctint)( t25 * 13623U - t22 * 9102U + (1 << 13)) >> 14;
1562 
/* stage 4: butterflies */
1563  t0a = t0 + t7;
1564  t1a = t1 + t6a;
1565  t2a = t2 + t5a;
1566  t3a = t3 + t4;
1567  t4a = t3 - t4;
1568  t5 = t2 - t5a;
1569  t6 = t1 - t6a;
1570  t7a = t0 - t7;
1571  t8a = t8 + t11;
1572  t9 = t9a + t10a;
1573  t10 = t9a - t10a;
1574  t11a = t8 - t11;
1575  t12a = t15 - t12;
1576  t13 = t14a - t13a;
1577  t14 = t14a + t13a;
1578  t15a = t15 + t12;
1579  t16a = t16 + t19;
1580  t17 = t17a + t18a;
1581  t18 = t17a - t18a;
1582  t19a = t16 - t19;
1583  t20a = t23 - t20;
1584  t21 = t22a - t21a;
1585  t22 = t22a + t21a;
1586  t23a = t23 + t20;
1587  t24a = t24 + t27;
1588  t25 = t25a + t26a;
1589  t26 = t25a - t26a;
1590  t27a = t24 - t27;
1591  t28a = t31 - t28;
1592  t29 = t30a - t29a;
1593  t30 = t30a + t29a;
1594  t31a = t31 + t28;
1595 
/* stage 5: rotations */
1596  t10a = (dctint)((t13 - t10) * 11585U + (1 << 13)) >> 14;
1597  t13a = (dctint)((t13 + t10) * 11585U + (1 << 13)) >> 14;
1598  t11 = (dctint)((t12a - t11a) * 11585U + (1 << 13)) >> 14;
1599  t12 = (dctint)((t12a + t11a) * 11585U + (1 << 13)) >> 14;
1600  t18a = (dctint)( t29 * 6270U - t18 * 15137U + (1 << 13)) >> 14;
1601  t29a = (dctint)( t29 * 15137U + t18 * 6270U + (1 << 13)) >> 14;
1602  t19 = (dctint)( t28a * 6270U - t19a * 15137U + (1 << 13)) >> 14;
1603  t28 = (dctint)( t28a * 15137U + t19a * 6270U + (1 << 13)) >> 14;
1604  t20 = (dctint)(-(t27a * 15137U + t20a * 6270U) + (1 << 13)) >> 14;
1605  t27 = (dctint)( t27a * 6270U - t20a * 15137U + (1 << 13)) >> 14;
1606  t21a = (dctint)(-(t26 * 15137U + t21 * 6270U) + (1 << 13)) >> 14;
1607  t26a = (dctint)( t26 * 6270U - t21 * 15137U + (1 << 13)) >> 14;
1608 
/* stage 6: butterflies */
1609  t0 = t0a + t15a;
1610  t1 = t1a + t14;
1611  t2 = t2a + t13a;
1612  t3 = t3a + t12;
1613  t4 = t4a + t11;
1614  t5a = t5 + t10a;
1615  t6a = t6 + t9;
1616  t7 = t7a + t8a;
1617  t8 = t7a - t8a;
1618  t9a = t6 - t9;
1619  t10 = t5 - t10a;
1620  t11a = t4a - t11;
1621  t12a = t3a - t12;
1622  t13 = t2a - t13a;
1623  t14a = t1a - t14;
1624  t15 = t0a - t15a;
1625  t16 = t16a + t23a;
1626  t17a = t17 + t22;
1627  t18 = t18a + t21a;
1628  t19a = t19 + t20;
1629  t20a = t19 - t20;
1630  t21 = t18a - t21a;
1631  t22a = t17 - t22;
1632  t23 = t16a - t23a;
1633  t24 = t31a - t24a;
1634  t25a = t30 - t25;
1635  t26 = t29a - t26a;
1636  t27a = t28 - t27;
1637  t28a = t28 + t27;
1638  t29 = t29a + t26a;
1639  t30a = t30 + t25;
1640  t31 = t31a + t24a;
1641 
/* stage 7: last rotations */
1642  t20 = (dctint)((t27a - t20a) * 11585U + (1 << 13)) >> 14;
1643  t27 = (dctint)((t27a + t20a) * 11585U + (1 << 13)) >> 14;
1644  t21a = (dctint)((t26 - t21 ) * 11585U + (1 << 13)) >> 14;
1645  t26a = (dctint)((t26 + t21 ) * 11585U + (1 << 13)) >> 14;
1646  t22 = (dctint)((t25a - t22a) * 11585U + (1 << 13)) >> 14;
1647  t25 = (dctint)((t25a + t22a) * 11585U + (1 << 13)) >> 14;
1648  t23a = (dctint)((t24 - t23 ) * 11585U + (1 << 13)) >> 14;
1649  t24a = (dctint)((t24 + t23 ) * 11585U + (1 << 13)) >> 14;
1650 
/* output butterflies: out[k] and out[31 - k] share the same two terms */
1651  out[ 0] = t0 + t31;
1652  out[ 1] = t1 + t30a;
1653  out[ 2] = t2 + t29;
1654  out[ 3] = t3 + t28a;
1655  out[ 4] = t4 + t27;
1656  out[ 5] = t5a + t26a;
1657  out[ 6] = t6a + t25;
1658  out[ 7] = t7 + t24a;
1659  out[ 8] = t8 + t23a;
1660  out[ 9] = t9a + t22;
1661  out[10] = t10 + t21a;
1662  out[11] = t11a + t20;
1663  out[12] = t12a + t19a;
1664  out[13] = t13 + t18;
1665  out[14] = t14a + t17a;
1666  out[15] = t15 + t16;
1667  out[16] = t15 - t16;
1668  out[17] = t14a - t17a;
1669  out[18] = t13 - t18;
1670  out[19] = t12a - t19a;
1671  out[20] = t11a - t20;
1672  out[21] = t10 - t21a;
1673  out[22] = t9a - t22;
1674  out[23] = t8 - t23a;
1675  out[24] = t7 - t24a;
1676  out[25] = t6a - t25;
1677  out[26] = t5a - t26a;
1678  out[27] = t4 - t27;
1679  out[28] = t3 - t28a;
1680  out[29] = t2 - t29;
1681  out[30] = t1 - t30a;
1682  out[31] = t0 - t31;
1683 }
1684 
1686 
1687 static av_always_inline void iwht4_1d(const dctcoef *in, ptrdiff_t stride,
1688  dctcoef *out, int pass)
1689 {
1690  int t0, t1, t2, t3, t4;
1691 
1692  if (pass == 0) {
1693  t0 = IN(0) >> 2;
1694  t1 = IN(3) >> 2;
1695  t2 = IN(1) >> 2;
1696  t3 = IN(2) >> 2;
1697  } else {
1698  t0 = IN(0);
1699  t1 = IN(3);
1700  t2 = IN(1);
1701  t3 = IN(2);
1702  }
1703 
1704  t0 += t2;
1705  t3 -= t1;
1706  t4 = (t0 - t3) >> 1;
1707  t1 = t4 - t1;
1708  t2 = t4 - t2;
1709  t0 -= t1;
1710  t3 += t2;
1711 
1712  out[0] = t0;
1713  out[1] = t1;
1714  out[2] = t2;
1715  out[3] = t3;
1716 }
1717 
/* Generate iwht_iwht_4x4_add_c(): both passes use iwht4_1d(), there is no
 * final rounding shift (bits = 0) and no DC-only shortcut. */
1718 itxfm_wrapper(iwht, iwht, 4, 0, 0)
1719 
/* The helper macros are only needed while the transforms are generated. */
1720 #undef IN
1721 #undef itxfm_wrapper
1722 #undef itxfm_wrap
1723 
1724 static av_cold void vp9dsp_itxfm_init(VP9DSPContext *dsp)
1725 {
1726 #define init_itxfm(tx, sz) \
1727  dsp->itxfm_add[tx][DCT_DCT] = idct_idct_##sz##_add_c; \
1728  dsp->itxfm_add[tx][DCT_ADST] = iadst_idct_##sz##_add_c; \
1729  dsp->itxfm_add[tx][ADST_DCT] = idct_iadst_##sz##_add_c; \
1730  dsp->itxfm_add[tx][ADST_ADST] = iadst_iadst_##sz##_add_c
1731 
1732 #define init_idct(tx, nm) \
1733  dsp->itxfm_add[tx][DCT_DCT] = \
1734  dsp->itxfm_add[tx][ADST_DCT] = \
1735  dsp->itxfm_add[tx][DCT_ADST] = \
1736  dsp->itxfm_add[tx][ADST_ADST] = nm##_add_c
1737 
1738  init_itxfm(TX_4X4, 4x4);
1739  init_itxfm(TX_8X8, 8x8);
1740  init_itxfm(TX_16X16, 16x16);
1741  init_idct(TX_32X32, idct_idct_32x32);
1742  init_idct(4 /* lossless */, iwht_iwht_4x4);
1743 
1744 #undef init_itxfm
1745 #undef init_idct
1746 }
1747 
/*
 * Filter eight consecutive positions across one block edge, in place.
 *
 * dst      first sample on the "q" side of the edge for the first position;
 *          the "p" samples are read at negative multiples of strideb
 * E, I, H  thresholds; each is shifted left by BIT_DEPTH - 8 before use.
 *          E and I gate whether a position is filtered at all, H selects
 *          between the two narrow-filter variants
 * stridea  step from one filtered position to the next
 * strideb  step from one sample to the next across the edge
 * wd       filter width: >= 16 allows the filter that rewrites 7 samples
 *          per side, >= 8 the one that rewrites 3 samples per side,
 *          otherwise only the narrow filter is available
 *
 * F, the flatness threshold, is 1 << (BIT_DEPTH - 8).
 */
1748 static av_always_inline void loop_filter(pixel *dst, int E, int I, int H,
1749  ptrdiff_t stridea, ptrdiff_t strideb,
1750  int wd)
1751 {
1752  int i, F = 1 << (BIT_DEPTH - 8);
1753 
1754  E <<= (BIT_DEPTH - 8);
1755  I <<= (BIT_DEPTH - 8);
1756  H <<= (BIT_DEPTH - 8);
1757  for (i = 0; i < 8; i++, dst += stridea) {
/* p4..p7 and q4..q7 are only loaded (and only used) when wd >= 16 */
1758  int p7, p6, p5, p4;
1759  int p3 = dst[strideb * -4], p2 = dst[strideb * -3];
1760  int p1 = dst[strideb * -2], p0 = dst[strideb * -1];
1761  int q0 = dst[strideb * +0], q1 = dst[strideb * +1];
1762  int q2 = dst[strideb * +2], q3 = dst[strideb * +3];
1763  int q4, q5, q6, q7;
/* filter mask: neighbouring differences within I and the step across the
 * edge within E */
1764  int fm = FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
1765  FFABS(p1 - p0) <= I && FFABS(q1 - q0) <= I &&
1766  FFABS(q2 - q1) <= I && FFABS(q3 - q2) <= I &&
1767  FFABS(p0 - q0) * 2 + (FFABS(p1 - q1) >> 1) <= E;
/* flat8out is only assigned when wd >= 16 and flat8in only when wd >= 8;
 * every read below is guarded by the same wd test */
1768  int flat8out, flat8in;
1769 
1770  if (!fm)
1771  continue;
1772 
1773  if (wd >= 16) {
1774  p7 = dst[strideb * -8];
1775  p6 = dst[strideb * -7];
1776  p5 = dst[strideb * -6];
1777  p4 = dst[strideb * -5];
1778  q4 = dst[strideb * +4];
1779  q5 = dst[strideb * +5];
1780  q6 = dst[strideb * +6];
1781  q7 = dst[strideb * +7];
1782 
1783  flat8out = FFABS(p7 - p0) <= F && FFABS(p6 - p0) <= F &&
1784  FFABS(p5 - p0) <= F && FFABS(p4 - p0) <= F &&
1785  FFABS(q4 - q0) <= F && FFABS(q5 - q0) <= F &&
1786  FFABS(q6 - q0) <= F && FFABS(q7 - q0) <= F;
1787  }
1788 
1789  if (wd >= 8)
1790  flat8in = FFABS(p3 - p0) <= F && FFABS(p2 - p0) <= F &&
1791  FFABS(p1 - p0) <= F && FFABS(q1 - q0) <= F &&
1792  FFABS(q2 - q0) <= F && FFABS(q3 - q0) <= F;
1793 
1794  if (wd >= 16 && flat8out && flat8in) {
/* widest filter: each of the 14 rewritten samples is a rounded sum of
 * weights totalling 16, with p7 / q7 repeated near the ends */
1795  dst[strideb * -7] = (p7 + p7 + p7 + p7 + p7 + p7 + p7 + p6 * 2 +
1796  p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
1797  dst[strideb * -6] = (p7 + p7 + p7 + p7 + p7 + p7 + p6 + p5 * 2 +
1798  p4 + p3 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
1799  dst[strideb * -5] = (p7 + p7 + p7 + p7 + p7 + p6 + p5 + p4 * 2 +
1800  p3 + p2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4;
1801  dst[strideb * -4] = (p7 + p7 + p7 + p7 + p6 + p5 + p4 + p3 * 2 +
1802  p2 + p1 + p0 + q0 + q1 + q2 + q3 + 8) >> 4;
1803  dst[strideb * -3] = (p7 + p7 + p7 + p6 + p5 + p4 + p3 + p2 * 2 +
1804  p1 + p0 + q0 + q1 + q2 + q3 + q4 + 8) >> 4;
1805  dst[strideb * -2] = (p7 + p7 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
1806  p0 + q0 + q1 + q2 + q3 + q4 + q5 + 8) >> 4;
1807  dst[strideb * -1] = (p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
1808  q0 + q1 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
1809  dst[strideb * +0] = (p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
1810  q1 + q2 + q3 + q4 + q5 + q6 + q7 + 8) >> 4;
1811  dst[strideb * +1] = (p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
1812  q2 + q3 + q4 + q5 + q6 + q7 + q7 + 8) >> 4;
1813  dst[strideb * +2] = (p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
1814  q3 + q4 + q5 + q6 + q7 + q7 + q7 + 8) >> 4;
1815  dst[strideb * +3] = (p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 +
1816  q4 + q5 + q6 + q7 + q7 + q7 + q7 + 8) >> 4;
1817  dst[strideb * +4] = (p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 +
1818  q5 + q6 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
1819  dst[strideb * +5] = (p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 +
1820  q6 + q7 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
1821  dst[strideb * +6] = (p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 +
1822  q7 + q7 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
1823  } else if (wd >= 8 && flat8in) {
/* medium filter: 6 rewritten samples, weights totalling 8 */
1824  dst[strideb * -3] = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3;
1825  dst[strideb * -2] = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3;
1826  dst[strideb * -1] = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3;
1827  dst[strideb * +0] = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3;
1828  dst[strideb * +1] = (p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4) >> 3;
1829  dst[strideb * +2] = (p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3;
1830  } else {
/* narrow filter; hev is set when p1/p0 or q1/q0 differ by more than H */
1831  int hev = FFABS(p1 - p0) > H || FFABS(q1 - q0) > H;
1832 
1833  if (hev) {
/* hev set: the correction also includes p1 - q1; only p0 and q0 change */
1834  int f = av_clip_intp2(p1 - q1, BIT_DEPTH - 1), f1, f2;
1835  f = av_clip_intp2(3 * (q0 - p0) + f, BIT_DEPTH - 1);
1836 
1837  f1 = FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
1838  f2 = FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
1839 
1840  dst[strideb * -1] = av_clip_pixel(p0 + f2);
1841  dst[strideb * +0] = av_clip_pixel(q0 - f1);
1842  } else {
/* hev clear: p0 / q0 are corrected and p1 / q1 get half of f1, rounded */
1843  int f = av_clip_intp2(3 * (q0 - p0), BIT_DEPTH - 1), f1, f2;
1844 
1845  f1 = FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
1846  f2 = FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
1847 
1848  dst[strideb * -1] = av_clip_pixel(p0 + f2);
1849  dst[strideb * +0] = av_clip_pixel(q0 - f1);
1850 
1851  f = (f1 + 1) >> 1;
1852  dst[strideb * -2] = av_clip_pixel(p1 + f);
1853  dst[strideb * +1] = av_clip_pixel(q1 - f);
1854  }
1855  }
1856  }
1857 }
1858 
1859 #define lf_8_fn(dir, wd, stridea, strideb) \
1860 static void loop_filter_##dir##_##wd##_8_c(uint8_t *_dst, \
1861  ptrdiff_t stride, \
1862  int E, int I, int H) \
1863 { \
1864  pixel *dst = (pixel *) _dst; \
1865  stride /= sizeof(pixel); \
1866  loop_filter(dst, E, I, H, stridea, strideb, wd); \
1867 }
1868 
1869 #define lf_8_fns(wd) \
1870 lf_8_fn(h, wd, stride, 1) \
1871 lf_8_fn(v, wd, 1, stride)
1872 
1874 lf_8_fns(8)
1875 lf_8_fns(16)
1876 
1877 #undef lf_8_fn
1878 #undef lf_8_fns
1879 
1880 #define lf_16_fn(dir, stridea) \
1881 static void loop_filter_##dir##_16_16_c(uint8_t *dst, \
1882  ptrdiff_t stride, \
1883  int E, int I, int H) \
1884 { \
1885  loop_filter_##dir##_16_8_c(dst, stride, E, I, H); \
1886  loop_filter_##dir##_16_8_c(dst + 8 * stridea, stride, E, I, H); \
1887 }
1888 
1889 lf_16_fn(h, stride)
1890 lf_16_fn(v, sizeof(pixel))
1891 
1892 #undef lf_16_fn
1893 
1894 #define lf_mix_fn(dir, wd1, wd2, stridea) \
1895 static void loop_filter_##dir##_##wd1##wd2##_16_c(uint8_t *dst, \
1896  ptrdiff_t stride, \
1897  int E, int I, int H) \
1898 { \
1899  loop_filter_##dir##_##wd1##_8_c(dst, stride, E & 0xff, I & 0xff, H & 0xff); \
1900  loop_filter_##dir##_##wd2##_8_c(dst + 8 * stridea, stride, E >> 8, I >> 8, H >> 8); \
1901 }
1902 
1903 #define lf_mix_fns(wd1, wd2) \
1904 lf_mix_fn(h, wd1, wd2, stride) \
1905 lf_mix_fn(v, wd1, wd2, sizeof(pixel))
1906 
1907 lf_mix_fns(4, 4)
1908 lf_mix_fns(4, 8)
1909 lf_mix_fns(8, 4)
1910 lf_mix_fns(8, 8)
1911 
1912 #undef lf_mix_fn
1913 #undef lf_mix_fns
1914 
1915 static av_cold void vp9dsp_loopfilter_init(VP9DSPContext *dsp)
1916 {
1917  dsp->loop_filter_8[0][0] = loop_filter_h_4_8_c;
1918  dsp->loop_filter_8[0][1] = loop_filter_v_4_8_c;
1919  dsp->loop_filter_8[1][0] = loop_filter_h_8_8_c;
1920  dsp->loop_filter_8[1][1] = loop_filter_v_8_8_c;
1921  dsp->loop_filter_8[2][0] = loop_filter_h_16_8_c;
1922  dsp->loop_filter_8[2][1] = loop_filter_v_16_8_c;
1923 
1924  dsp->loop_filter_16[0] = loop_filter_h_16_16_c;
1925  dsp->loop_filter_16[1] = loop_filter_v_16_16_c;
1926 
1927  dsp->loop_filter_mix2[0][0][0] = loop_filter_h_44_16_c;
1928  dsp->loop_filter_mix2[0][0][1] = loop_filter_v_44_16_c;
1929  dsp->loop_filter_mix2[0][1][0] = loop_filter_h_48_16_c;
1930  dsp->loop_filter_mix2[0][1][1] = loop_filter_v_48_16_c;
1931  dsp->loop_filter_mix2[1][0][0] = loop_filter_h_84_16_c;
1932  dsp->loop_filter_mix2[1][0][1] = loop_filter_v_84_16_c;
1933  dsp->loop_filter_mix2[1][1][0] = loop_filter_h_88_16_c;
1934  dsp->loop_filter_mix2[1][1][1] = loop_filter_v_88_16_c;
1935 }
1936 
1937 #if BIT_DEPTH != 12
1938 
1939 static av_always_inline void copy_c(uint8_t *dst, ptrdiff_t dst_stride,
1940  const uint8_t *src, ptrdiff_t src_stride,
1941  int w, int h)
1942 {
1943  do {
1944  memcpy(dst, src, w * sizeof(pixel));
1945 
1946  dst += dst_stride;
1947  src += src_stride;
1948  } while (--h);
1949 }
1950 
1951 static av_always_inline void avg_c(uint8_t *_dst, ptrdiff_t dst_stride,
1952  const uint8_t *_src, ptrdiff_t src_stride,
1953  int w, int h)
1954 {
1955  pixel *dst = (pixel *) _dst;
1956  const pixel *src = (const pixel *) _src;
1957 
1958  dst_stride /= sizeof(pixel);
1959  src_stride /= sizeof(pixel);
1960  do {
1961  int x;
1962 
1963  for (x = 0; x < w; x += 4)
1964  AV_WN4PA(&dst[x], rnd_avg_pixel4(AV_RN4PA(&dst[x]), AV_RN4P(&src[x])));
1965 
1966  dst += dst_stride;
1967  src += src_stride;
1968  } while (--h);
1969 }
1970 
/*
 * fpel_fn(type, sz): define <type><sz>_c(), which forwards to <type>_c()
 * with the block width fixed to sz. mx and my are part of the signature
 * but are not used by the generated body.
 */
#define fpel_fn(type, sz) \
static void type##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \
                         const uint8_t *src, ptrdiff_t src_stride, \
                         int h, int mx, int my) \
{ \
    type##_c(dst, dst_stride, src, src_stride, sz, h); \
}

/* Emit both the copy and the avg function for one width. */
#define copy_avg_fn(sz) \
fpel_fn(copy, sz) \
fpel_fn(avg, sz)

/* Widths 64..4; the 64-wide pair is required by init_copy_avg(0, 64) in
 * ff_vp9dsp_mc_init(). */
copy_avg_fn(64)
copy_avg_fn(32)
copy_avg_fn(16)
copy_avg_fn(8)
copy_avg_fn(4)

#undef fpel_fn
#undef copy_avg_fn
1991 
1992 #endif /* BIT_DEPTH != 12 */
1993 
/*
 * FILTER_8TAP(src, x, F, stride): apply the 8 coefficients F[0..7] to the
 * samples src[x - 3 * stride] .. src[x + 4 * stride], add 64, shift right
 * by 7 and clip with av_clip_pixel().
 * Every argument is parenthesized so that expression arguments (e.g. a
 * pointer sum for src or a difference for stride) keep their meaning; note
 * that each argument is still evaluated several times.
 */
#define FILTER_8TAP(src, x, F, stride) \
    av_clip_pixel(((F)[0] * (src)[(x) + -3 * (stride)] + \
                   (F)[1] * (src)[(x) + -2 * (stride)] + \
                   (F)[2] * (src)[(x) + -1 * (stride)] + \
                   (F)[3] * (src)[(x) + +0 * (stride)] + \
                   (F)[4] * (src)[(x) + +1 * (stride)] + \
                   (F)[5] * (src)[(x) + +2 * (stride)] + \
                   (F)[6] * (src)[(x) + +3 * (stride)] + \
                   (F)[7] * (src)[(x) + +4 * (stride)] + 64) >> 7)
2003 
2004 static av_always_inline void do_8tap_1d_c(uint8_t *_dst, ptrdiff_t dst_stride,
2005  const uint8_t *_src, ptrdiff_t src_stride,
2006  int w, int h, ptrdiff_t ds,
2007  const int16_t *filter, int avg)
2008 {
2009  pixel *dst = (pixel *) _dst;
2010  const pixel *src = (const pixel *) _src;
2011 
2012  dst_stride /= sizeof(pixel);
2013  src_stride /= sizeof(pixel);
2014  do {
2015  int x;
2016 
2017  for (x = 0; x < w; x++)
2018  if (avg) {
2019  dst[x] = (dst[x] + FILTER_8TAP(src, x, filter, ds) + 1) >> 1;
2020  } else {
2021  dst[x] = FILTER_8TAP(src, x, filter, ds);
2022  }
2023 
2024  dst += dst_stride;
2025  src += src_stride;
2026  } while (--h);
2027 }
2028 
2029 #define filter_8tap_1d_fn(opn, opa, dir, ds) \
2030 static av_noinline void opn##_8tap_1d_##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
2031  const uint8_t *src, ptrdiff_t src_stride, \
2032  int w, int h, const int16_t *filter) \
2033 { \
2034  do_8tap_1d_c(dst, dst_stride, src, src_stride, w, h, ds, filter, opa); \
2035 }
2036 
2037 filter_8tap_1d_fn(put, 0, v, src_stride / sizeof(pixel))
2039 filter_8tap_1d_fn(avg, 1, v, src_stride / sizeof(pixel))
2040 filter_8tap_1d_fn(avg, 1, h, 1)
2041 
2042 #undef filter_8tap_1d_fn
2043 
2044 static av_always_inline void do_8tap_2d_c(uint8_t *_dst, ptrdiff_t dst_stride,
2045  const uint8_t *_src, ptrdiff_t src_stride,
2046  int w, int h, const int16_t *filterx,
2047  const int16_t *filtery, int avg)
2048 {
2049  int tmp_h = h + 7;
2050  pixel tmp[64 * 71], *tmp_ptr = tmp;
2051  pixel *dst = (pixel *) _dst;
2052  const pixel *src = (const pixel *) _src;
2053 
2054  dst_stride /= sizeof(pixel);
2055  src_stride /= sizeof(pixel);
2056  src -= src_stride * 3;
2057  do {
2058  int x;
2059 
2060  for (x = 0; x < w; x++)
2061  tmp_ptr[x] = FILTER_8TAP(src, x, filterx, 1);
2062 
2063  tmp_ptr += 64;
2064  src += src_stride;
2065  } while (--tmp_h);
2066 
2067  tmp_ptr = tmp + 64 * 3;
2068  do {
2069  int x;
2070 
2071  for (x = 0; x < w; x++)
2072  if (avg) {
2073  dst[x] = (dst[x] + FILTER_8TAP(tmp_ptr, x, filtery, 64) + 1) >> 1;
2074  } else {
2075  dst[x] = FILTER_8TAP(tmp_ptr, x, filtery, 64);
2076  }
2077 
2078  tmp_ptr += 64;
2079  dst += dst_stride;
2080  } while (--h);
2081 }
2082 
2083 #define filter_8tap_2d_fn(opn, opa) \
2084 static av_noinline void opn##_8tap_2d_hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
2085  const uint8_t *src, ptrdiff_t src_stride, \
2086  int w, int h, const int16_t *filterx, \
2087  const int16_t *filtery) \
2088 { \
2089  do_8tap_2d_c(dst, dst_stride, src, src_stride, w, h, filterx, filtery, opa); \
2090 }
2091 
2094 
2095 #undef filter_8tap_2d_fn
2096 
/*
 * filter_fn_1d(sz, dir, frac, name, filter_idx, op): define
 * <op>_8tap_<name>_<sz><dir>_c(). The generated function looks up the
 * coefficients ff_vp9_subpel_filters[filter_idx][frac] (frac is expanded
 * inside the body, so it may name the mx or my parameter) and calls
 * <op>_8tap_1d_<dir>_c() with the width fixed to sz.
 */
#define filter_fn_1d(sz, dir, frac, name, filter_idx, op)                     \
static void op##_8tap_##name##_##sz##dir##_c(uint8_t *dst,                    \
                                             ptrdiff_t dst_stride,            \
                                             const uint8_t *src,              \
                                             ptrdiff_t src_stride,            \
                                             int h, int mx, int my)           \
{                                                                             \
    const int16_t *const taps = ff_vp9_subpel_filters[filter_idx][frac];      \
                                                                              \
    op##_8tap_1d_##dir##_c(dst, dst_stride, src, src_stride, sz, h, taps);    \
}
2105 
/*
 * filter_fn_2d(sz, name, filter_idx, op): define <op>_8tap_<name>_<sz>hv_c().
 * The generated function selects the horizontal coefficients with mx and
 * the vertical ones with my from ff_vp9_subpel_filters[filter_idx] and
 * calls <op>_8tap_2d_hv_c() with the width fixed to sz.
 */
#define filter_fn_2d(sz, name, filter_idx, op)                                \
static void op##_8tap_##name##_##sz##hv_c(uint8_t *dst,                       \
                                          ptrdiff_t dst_stride,               \
                                          const uint8_t *src,                 \
                                          ptrdiff_t src_stride,               \
                                          int h, int mx, int my)              \
{                                                                             \
    const int16_t *const taps_x = ff_vp9_subpel_filters[filter_idx][mx];      \
    const int16_t *const taps_y = ff_vp9_subpel_filters[filter_idx][my];      \
                                                                              \
    op##_8tap_2d_hv_c(dst, dst_stride, src, src_stride, sz, h,                \
                      taps_x, taps_y);                                        \
}
2115 
2116 #if BIT_DEPTH != 12
2117 
/*
 * FILTER_BILIN(src, x, mxy, stride): interpolate between src[x] and
 * src[x + stride] with weight mxy / 16:
 *     src[x] + ((mxy * (src[x + stride] - src[x]) + 8) >> 4)
 * Every argument is parenthesized so that expression arguments (e.g.
 * "a + b" for mxy) are not split by operator precedence; note that src and
 * x are still evaluated more than once.
 */
#define FILTER_BILIN(src, x, mxy, stride) \
    ((src)[x] + (((mxy) * ((src)[(x) + (stride)] - (src)[x]) + 8) >> 4))
2120 
2121 static av_always_inline void do_bilin_1d_c(uint8_t *_dst, ptrdiff_t dst_stride,
2122  const uint8_t *_src, ptrdiff_t src_stride,
2123  int w, int h, ptrdiff_t ds, int mxy, int avg)
2124 {
2125  pixel *dst = (pixel *) _dst;
2126  const pixel *src = (const pixel *) _src;
2127 
2128  dst_stride /= sizeof(pixel);
2129  src_stride /= sizeof(pixel);
2130  do {
2131  int x;
2132 
2133  for (x = 0; x < w; x++)
2134  if (avg) {
2135  dst[x] = (dst[x] + FILTER_BILIN(src, x, mxy, ds) + 1) >> 1;
2136  } else {
2137  dst[x] = FILTER_BILIN(src, x, mxy, ds);
2138  }
2139 
2140  dst += dst_stride;
2141  src += src_stride;
2142  } while (--h);
2143 }
2144 
2145 #define bilin_1d_fn(opn, opa, dir, ds) \
2146 static av_noinline void opn##_bilin_1d_##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
2147  const uint8_t *src, ptrdiff_t src_stride, \
2148  int w, int h, int mxy) \
2149 { \
2150  do_bilin_1d_c(dst, dst_stride, src, src_stride, w, h, ds, mxy, opa); \
2151 }
2152 
2153 bilin_1d_fn(put, 0, v, src_stride / sizeof(pixel))
2154 bilin_1d_fn(put, 0, h, 1)
2155 bilin_1d_fn(avg, 1, v, src_stride / sizeof(pixel))
2156 bilin_1d_fn(avg, 1, h, 1)
2157 
2158 #undef bilin_1d_fn
2159 
2160 static av_always_inline void do_bilin_2d_c(uint8_t *_dst, ptrdiff_t dst_stride,
2161  const uint8_t *_src, ptrdiff_t src_stride,
2162  int w, int h, int mx, int my, int avg)
2163 {
2164  pixel tmp[64 * 65], *tmp_ptr = tmp;
2165  int tmp_h = h + 1;
2166  pixel *dst = (pixel *) _dst;
2167  const pixel *src = (const pixel *) _src;
2168 
2169  dst_stride /= sizeof(pixel);
2170  src_stride /= sizeof(pixel);
2171  do {
2172  int x;
2173 
2174  for (x = 0; x < w; x++)
2175  tmp_ptr[x] = FILTER_BILIN(src, x, mx, 1);
2176 
2177  tmp_ptr += 64;
2178  src += src_stride;
2179  } while (--tmp_h);
2180 
2181  tmp_ptr = tmp;
2182  do {
2183  int x;
2184 
2185  for (x = 0; x < w; x++)
2186  if (avg) {
2187  dst[x] = (dst[x] + FILTER_BILIN(tmp_ptr, x, my, 64) + 1) >> 1;
2188  } else {
2189  dst[x] = FILTER_BILIN(tmp_ptr, x, my, 64);
2190  }
2191 
2192  tmp_ptr += 64;
2193  dst += dst_stride;
2194  } while (--h);
2195 }
2196 
2197 #define bilin_2d_fn(opn, opa) \
2198 static av_noinline void opn##_bilin_2d_hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
2199  const uint8_t *src, ptrdiff_t src_stride, \
2200  int w, int h, int mx, int my) \
2201 { \
2202  do_bilin_2d_c(dst, dst_stride, src, src_stride, w, h, mx, my, opa); \
2203 }
2204 
2206 bilin_2d_fn(avg, 1)
2207 
2208 #undef bilin_2d_fn
2209 
/*
 * bilinf_fn_1d(sz, dir, frac, op): define <op>_bilin_<sz><dir>_c(), which
 * calls <op>_bilin_1d_<dir>_c() with the width fixed to sz. frac is
 * expanded inside the body and selects which of the mx/my parameters is
 * passed on as the interpolation weight.
 */
#define bilinf_fn_1d(sz, dir, frac, op)                                       \
static void op##_bilin_##sz##dir##_c(uint8_t *dst,                            \
                                     ptrdiff_t dst_stride,                    \
                                     const uint8_t *src,                      \
                                     ptrdiff_t src_stride,                    \
                                     int h, int mx, int my)                   \
{                                                                             \
    op##_bilin_1d_##dir##_c(dst, dst_stride, src, src_stride,                 \
                            sz, h, frac);                                     \
}
2217 
/*
 * bilinf_fn_2d(sz, op): define <op>_bilin_<sz>hv_c(), which calls
 * <op>_bilin_2d_hv_c() with the width fixed to sz and passes both mx and
 * my through.
 */
#define bilinf_fn_2d(sz, op)                                                  \
static void op##_bilin_##sz##hv_c(uint8_t *dst,                               \
                                  ptrdiff_t dst_stride,                       \
                                  const uint8_t *src,                         \
                                  ptrdiff_t src_stride,                       \
                                  int h, int mx, int my)                      \
{                                                                             \
    op##_bilin_2d_hv_c(dst, dst_stride, src, src_stride,                      \
                       sz, h, mx, my);                                        \
}
2225 
2226 #else
2227 
/* BIT_DEPTH == 12 branch: the bilinear generators expand to nothing, so
 * filter_fn() emits only the 8-tap functions. Parameter names mirror the
 * real definitions in the other branch. */
#define bilinf_fn_1d(sz, dir, dir_m, avg)
#define bilinf_fn_2d(sz, avg)
2230 
2231 #endif
2232 
2233 #define filter_fn(sz, avg) \
2234 filter_fn_1d(sz, h, mx, regular, FILTER_8TAP_REGULAR, avg) \
2235 filter_fn_1d(sz, v, my, regular, FILTER_8TAP_REGULAR, avg) \
2236 filter_fn_2d(sz, regular, FILTER_8TAP_REGULAR, avg) \
2237 filter_fn_1d(sz, h, mx, smooth, FILTER_8TAP_SMOOTH, avg) \
2238 filter_fn_1d(sz, v, my, smooth, FILTER_8TAP_SMOOTH, avg) \
2239 filter_fn_2d(sz, smooth, FILTER_8TAP_SMOOTH, avg) \
2240 filter_fn_1d(sz, h, mx, sharp, FILTER_8TAP_SHARP, avg) \
2241 filter_fn_1d(sz, v, my, sharp, FILTER_8TAP_SHARP, avg) \
2242 filter_fn_2d(sz, sharp, FILTER_8TAP_SHARP, avg) \
2243 bilinf_fn_1d(sz, h, mx, avg) \
2244 bilinf_fn_1d(sz, v, my, avg) \
2245 bilinf_fn_2d(sz, avg)
2246 
2247 #define filter_fn_set(avg) \
2248 filter_fn(64, avg) \
2249 filter_fn(32, avg) \
2250 filter_fn(16, avg) \
2251 filter_fn(8, avg) \
2252 filter_fn(4, avg)
2253 
2254 filter_fn_set(put)
2256 
2257 #undef filter_fn
2258 #undef filter_fn_set
2259 #undef filter_fn_1d
2260 #undef filter_fn_2d
2261 #undef bilinf_fn_1d
2262 #undef bilinf_fn_2d
2263 
2264 #if BIT_DEPTH != 8
2265 void ff_vp9dsp_mc_init_10(VP9DSPContext *dsp);
2266 #endif
2267 #if BIT_DEPTH != 10
2268 static
2269 #endif
2270 av_cold void FUNC(ff_vp9dsp_mc_init)(VP9DSPContext *dsp)
2271 {
2272 #if BIT_DEPTH == 12
2273  ff_vp9dsp_mc_init_10(dsp);
2274 #else /* BIT_DEPTH == 12 */
2275 
2276 #define init_fpel(idx1, idx2, sz, type) \
2277  dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = type##sz##_c; \
2278  dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = type##sz##_c; \
2279  dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = type##sz##_c; \
2280  dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = type##sz##_c
2281 
2282 #define init_copy_avg(idx, sz) \
2283  init_fpel(idx, 0, sz, copy); \
2284  init_fpel(idx, 1, sz, avg)
2285 
2286  init_copy_avg(0, 64);
2287  init_copy_avg(1, 32);
2288  init_copy_avg(2, 16);
2289  init_copy_avg(3, 8);
2290  init_copy_avg(4, 4);
2291 
2292 #undef init_copy_avg
2293 #undef init_fpel
2294 
2295 #endif /* BIT_DEPTH == 12 */
2296 
2297 #define init_subpel1_bd_aware(idx1, idx2, idxh, idxv, sz, dir, type) \
2298  dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = type##_8tap_smooth_##sz##dir##_c; \
2299  dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_c; \
2300  dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = type##_8tap_sharp_##sz##dir##_c
2301 
2302 #if BIT_DEPTH == 12
2303 #define init_subpel1 init_subpel1_bd_aware
2304 #else
2305 #define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type) \
2306  init_subpel1_bd_aware(idx1, idx2, idxh, idxv, sz, dir, type); \
2307  dsp->mc[idx1][FILTER_BILINEAR ][idx2][idxh][idxv] = type##_bilin_##sz##dir##_c
2308 #endif
2309 
2310 #define init_subpel2(idx, idxh, idxv, dir, type) \
2311  init_subpel1(0, idx, idxh, idxv, 64, dir, type); \
2312  init_subpel1(1, idx, idxh, idxv, 32, dir, type); \
2313  init_subpel1(2, idx, idxh, idxv, 16, dir, type); \
2314  init_subpel1(3, idx, idxh, idxv, 8, dir, type); \
2315  init_subpel1(4, idx, idxh, idxv, 4, dir, type)
2316 
2317 #define init_subpel3(idx, type) \
2318  init_subpel2(idx, 1, 1, hv, type); \
2319  init_subpel2(idx, 0, 1, v, type); \
2320  init_subpel2(idx, 1, 0, h, type)
2321 
2322  init_subpel3(0, put);
2323  init_subpel3(1, avg);
2324 
2325 #undef init_subpel1
2326 #undef init_subpel2
2327 #undef init_subpel3
2328 #undef init_subpel1_bd_aware
2329 }
2330 
2331 static av_always_inline void do_scaled_8tap_c(uint8_t *_dst, ptrdiff_t dst_stride,
2332  const uint8_t *_src, ptrdiff_t src_stride,
2333  int w, int h, int mx, int my,
2334  int dx, int dy, int avg,
2335  const int16_t (*filters)[8])
2336 {
2337  int tmp_h = (((h - 1) * dy + my) >> 4) + 8;
2338  pixel tmp[64 * 135], *tmp_ptr = tmp;
2339  pixel *dst = (pixel *) _dst;
2340  const pixel *src = (const pixel *) _src;
2341 
2342  dst_stride /= sizeof(pixel);
2343  src_stride /= sizeof(pixel);
2344  src -= src_stride * 3;
2345  do {
2346  int x;
2347  int imx = mx, ioff = 0;
2348 
2349  for (x = 0; x < w; x++) {
2350  tmp_ptr[x] = FILTER_8TAP(src, ioff, filters[imx], 1);
2351  imx += dx;
2352  ioff += imx >> 4;
2353  imx &= 0xf;
2354  }
2355 
2356  tmp_ptr += 64;
2357  src += src_stride;
2358  } while (--tmp_h);
2359 
2360  tmp_ptr = tmp + 64 * 3;
2361  do {
2362  int x;
2363  const int16_t *filter = filters[my];
2364 
2365  for (x = 0; x < w; x++)
2366  if (avg) {
2367  dst[x] = (dst[x] + FILTER_8TAP(tmp_ptr, x, filter, 64) + 1) >> 1;
2368  } else {
2369  dst[x] = FILTER_8TAP(tmp_ptr, x, filter, 64);
2370  }
2371 
2372  my += dy;
2373  tmp_ptr += (my >> 4) * 64;
2374  my &= 0xf;
2375  dst += dst_stride;
2376  } while (--h);
2377 }
2378 
2379 #define scaled_filter_8tap_fn(opn, opa) \
2380 static av_noinline void opn##_scaled_8tap_c(uint8_t *dst, ptrdiff_t dst_stride, \
2381  const uint8_t *src, ptrdiff_t src_stride, \
2382  int w, int h, int mx, int my, int dx, int dy, \
2383  const int16_t (*filters)[8]) \
2384 { \
2385  do_scaled_8tap_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, \
2386  opa, filters); \
2387 }
2388 
2391 
2392 #undef scaled_filter_8tap_fn
2393 
2394 #undef FILTER_8TAP
2395 
/*
 * scaled_filter_fn(sz, name, filter_idx, op): define
 * <op>_scaled_<name>_<sz>_c(), which calls <op>_scaled_8tap_c() with the
 * width fixed to sz and the filter bank ff_vp9_subpel_filters[filter_idx].
 */
#define scaled_filter_fn(sz, name, filter_idx, op)                            \
static void op##_scaled_##name##_##sz##_c(uint8_t *dst,                       \
                                          ptrdiff_t dst_stride,               \
                                          const uint8_t *src,                 \
                                          ptrdiff_t src_stride,               \
                                          int h, int mx, int my,              \
                                          int dx, int dy)                     \
{                                                                             \
    const int16_t (*bank)[8] = ff_vp9_subpel_filters[filter_idx];             \
                                                                              \
    op##_scaled_8tap_c(dst, dst_stride, src, src_stride, sz, h,               \
                       mx, my, dx, dy, bank);                                 \
}
2404 
2405 #if BIT_DEPTH != 12
2406 
2407 static av_always_inline void do_scaled_bilin_c(uint8_t *_dst, ptrdiff_t dst_stride,
2408  const uint8_t *_src, ptrdiff_t src_stride,
2409  int w, int h, int mx, int my,
2410  int dx, int dy, int avg)
2411 {
2412  pixel tmp[64 * 129], *tmp_ptr = tmp;
2413  int tmp_h = (((h - 1) * dy + my) >> 4) + 2;
2414  pixel *dst = (pixel *) _dst;
2415  const pixel *src = (const pixel *) _src;
2416 
2417  dst_stride /= sizeof(pixel);
2418  src_stride /= sizeof(pixel);
2419  do {
2420  int x;
2421  int imx = mx, ioff = 0;
2422 
2423  for (x = 0; x < w; x++) {
2424  tmp_ptr[x] = FILTER_BILIN(src, ioff, imx, 1);
2425  imx += dx;
2426  ioff += imx >> 4;
2427  imx &= 0xf;
2428  }
2429 
2430  tmp_ptr += 64;
2431  src += src_stride;
2432  } while (--tmp_h);
2433 
2434  tmp_ptr = tmp;
2435  do {
2436  int x;
2437 
2438  for (x = 0; x < w; x++)
2439  if (avg) {
2440  dst[x] = (dst[x] + FILTER_BILIN(tmp_ptr, x, my, 64) + 1) >> 1;
2441  } else {
2442  dst[x] = FILTER_BILIN(tmp_ptr, x, my, 64);
2443  }
2444 
2445  my += dy;
2446  tmp_ptr += (my >> 4) * 64;
2447  my &= 0xf;
2448  dst += dst_stride;
2449  } while (--h);
2450 }
2451 
2452 #define scaled_bilin_fn(opn, opa) \
2453 static av_noinline void opn##_scaled_bilin_c(uint8_t *dst, ptrdiff_t dst_stride, \
2454  const uint8_t *src, ptrdiff_t src_stride, \
2455  int w, int h, int mx, int my, int dx, int dy) \
2456 { \
2457  do_scaled_bilin_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, opa); \
2458 }
2459 
2461 scaled_bilin_fn(avg, 1)
2462 
2463 #undef scaled_bilin_fn
2464 
2465 #undef FILTER_BILIN
2466 
/*
 * scaled_bilinf_fn(sz, op): define <op>_scaled_bilin_<sz>_c(), which calls
 * <op>_scaled_bilin_c() with the width fixed to sz and passes the remaining
 * arguments through unchanged.
 */
#define scaled_bilinf_fn(sz, op)                                              \
static void op##_scaled_bilin_##sz##_c(uint8_t *dst,                          \
                                       ptrdiff_t dst_stride,                  \
                                       const uint8_t *src,                    \
                                       ptrdiff_t src_stride,                  \
                                       int h, int mx, int my,                 \
                                       int dx, int dy)                        \
{                                                                             \
    op##_scaled_bilin_c(dst, dst_stride, src, src_stride,                     \
                        sz, h, mx, my, dx, dy);                               \
}
2474 
2475 #else
2476 
/* BIT_DEPTH == 12 branch: no scaled bilinear functions are generated. */
#define scaled_bilinf_fn(sz, avg)
2478 
2479 #endif
2480 
2481 #define scaled_filter_fns(sz, avg) \
2482 scaled_filter_fn(sz, regular, FILTER_8TAP_REGULAR, avg) \
2483 scaled_filter_fn(sz, smooth, FILTER_8TAP_SMOOTH, avg) \
2484 scaled_filter_fn(sz, sharp, FILTER_8TAP_SHARP, avg) \
2485 scaled_bilinf_fn(sz, avg)
2486 
2487 #define scaled_filter_fn_set(avg) \
2488 scaled_filter_fns(64, avg) \
2489 scaled_filter_fns(32, avg) \
2490 scaled_filter_fns(16, avg) \
2491 scaled_filter_fns(8, avg) \
2492 scaled_filter_fns(4, avg)
2493 
2496 
2497 #undef scaled_filter_fns
2498 #undef scaled_filter_fn_set
2499 #undef scaled_filter_fn
2500 #undef scaled_bilinf_fn
2501 
2502 #if BIT_DEPTH != 8
2503 void ff_vp9dsp_scaled_mc_init_10(VP9DSPContext *dsp);
2504 #endif
2505 #if BIT_DEPTH != 10
2506 static
2507 #endif
2508 av_cold void FUNC(ff_vp9dsp_scaled_mc_init)(VP9DSPContext *dsp)
2509 {
2510 #define init_scaled_bd_aware(idx1, idx2, sz, type) \
2511  dsp->smc[idx1][FILTER_8TAP_SMOOTH ][idx2] = type##_scaled_smooth_##sz##_c; \
2512  dsp->smc[idx1][FILTER_8TAP_REGULAR][idx2] = type##_scaled_regular_##sz##_c; \
2513  dsp->smc[idx1][FILTER_8TAP_SHARP ][idx2] = type##_scaled_sharp_##sz##_c
2514 
2515 #if BIT_DEPTH == 12
2516  ff_vp9dsp_scaled_mc_init_10(dsp);
2517 #define init_scaled(a,b,c,d) init_scaled_bd_aware(a,b,c,d)
2518 #else
2519 #define init_scaled(idx1, idx2, sz, type) \
2520  init_scaled_bd_aware(idx1, idx2, sz, type); \
2521  dsp->smc[idx1][FILTER_BILINEAR ][idx2] = type##_scaled_bilin_##sz##_c
2522 #endif
2523 
2524 #define init_scaled_put_avg(idx, sz) \
2525  init_scaled(idx, 0, sz, put); \
2526  init_scaled(idx, 1, sz, avg)
2527 
2528  init_scaled_put_avg(0, 64);
2529  init_scaled_put_avg(1, 32);
2530  init_scaled_put_avg(2, 16);
2531  init_scaled_put_avg(3, 8);
2532  init_scaled_put_avg(4, 4);
2533 
2534 #undef init_scaled_put_avg
2535 #undef init_scaled
2536 #undef init_scaled_bd_aware
2537 }
2538 
2540 {
2541  FUNC(ff_vp9dsp_intrapred_init)(dsp);
2542  vp9dsp_itxfm_init(dsp);
2543  vp9dsp_loopfilter_init(dsp);
2544  FUNC(ff_vp9dsp_mc_init)(dsp);
2545  FUNC(ff_vp9dsp_scaled_mc_init)(dsp);
2546 }
stride
int stride
Definition: mace.c:144
q1
static const uint8_t q1[256]
Definition: twofish.c:96
FILTER_8TAP
#define FILTER_8TAP(src, x, F, stride)
BIT_DEPTH
#define BIT_DEPTH
Definition: bit_depth_template.c:24
out
FILE * out
Definition: movenc.c:54
n
int n
Definition: avisynth_c.h:760
init_intra_pred
#define init_intra_pred(tx, sz)
dc_top_32x32_c
static void dc_top_32x32_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *_top)
Definition: vp9dsp_template.c:541
filters
static const struct PPFilter filters[]
Definition: postprocess.c:134
idct4_1d
static av_always_inline void idct4_1d(const dctcoef *in, ptrdiff_t stride, dctcoef *out, int pass)
Definition: vp9dsp_template.c:1170
init_idct
#define init_idct(tx, nm)
lf_16_fn
#define lf_16_fn(dir, stridea)
init_copy_avg
#define init_copy_avg(idx, sz)
AV_RN4P
#define AV_RN4P
Definition: bit_depth_template.c:91
VP9DSPContext::loop_filter_8
void(* loop_filter_8[3][2])(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr)
Definition: vp9dsp.h:80
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:26
w
uint8_t w
Definition: llviddspenc.c:38
lf_mix_fns
#define lf_mix_fns(wd1, wd2)
VP9DSPContext
Definition: vp9dsp.h:39
t0
#define t0
Definition: regdef.h:28
F
#define F(x)
hor_8x8_c
static void hor_8x8_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *top)
Definition: vp9dsp_template.c:127
t1
#define t1
Definition: regdef.h:29
tm_32x32_c
static void tm_32x32_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *_top)
Definition: vp9dsp_template.c:264
filter
filter_frame For filters that do not use the this method is called when a frame is pushed to the filter s input It can be called at any time except in a reentrant way If the input frame is enough to produce then the filter should push the output frames on the output link immediately As an exception to the previous rule if the input frame is enough to produce several output frames then the filter needs output only at least one per link The additional frames can be left buffered in the filter
Definition: filter_design.txt:228
init_itxfm
#define init_itxfm(tx, sz)
def_diag_downleft
#define def_diag_downleft(size)
Definition: vp9dsp_template.c:801
t10
#define t10
Definition: regdef.h:55
pixel4
#define pixel4
Definition: bit_depth_template.c:83
memset_bpc
static void memset_bpc(uint16_t *dst, int val, int len)
Definition: vp9dsp_template.c:773
diag_downleft_4x4_c
static void diag_downleft_4x4_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *_top)
Definition: vp9dsp_template.c:783
VP9DSPContext::loop_filter_mix2
void(* loop_filter_mix2[2][2][2])(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr)
Definition: vp9dsp.h:102
U
#define U(x)
Definition: vp56_arith.h:37
loop_filter
static av_always_inline void loop_filter(pixel *dst, int E, int I, int H, ptrdiff_t stridea, ptrdiff_t strideb, int wd)
Definition: vp9dsp_template.c:1748
dc_left_32x32_c
static void dc_left_32x32_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *top)
Definition: vp9dsp_template.c:460
lf_8_fns
#define lf_8_fns(wd)
Definition: vp9dsp_template.c:1869
dctcoef
#define dctcoef
Definition: bit_depth_template.c:84
iadst8_1d
static av_always_inline void iadst8_1d(const dctcoef *in, ptrdiff_t stride, dctcoef *out, int pass)
Definition: vp9dsp_template.c:1240
do_bilin_2d_c
static av_always_inline void do_bilin_2d_c(uint8_t *_dst, ptrdiff_t dst_stride, const uint8_t *_src, ptrdiff_t src_stride, int w, int h, int mx, int my, int avg)
Definition: vp9dsp_template.c:2160
src
#define src
Definition: vp8dsp.c:254
idct
static void idct(int16_t block[64])
Definition: 4xm.c:163
copy_c
static av_always_inline void copy_c(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int w, int h)
Definition: vp9dsp_template.c:1939
rnd_avg_pixel4
#define rnd_avg_pixel4
Definition: bit_depth_template.c:89
dctint
#define dctint
Definition: vp9dsp_10bpp.c:25
itxfm_wrapper
#define itxfm_wrapper(type_a, type_b, sz, bits, has_dconly)
Definition: vp9dsp_template.c:1123
a1
#define a1
Definition: regdef.h:47
def_hor_up
#define def_hor_up(size)
Definition: vp9dsp_template.c:1046
av_cold
#define av_cold
Definition: attributes.h:84
tm_4x4_c
static void tm_4x4_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *_top)
Definition: vp9dsp_template.c:188
t15
static int t15(InterplayACMContext *s, unsigned ind, unsigned col)
Definition: interplayacm.c:325
vert_4x4_c
static void vert_4x4_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *_top)
Definition: vp9dsp_template.c:33
AV_WN4PA
#define AV_WN4PA
Definition: bit_depth_template.c:95
dc_129_16x16_c
static void dc_129_16x16_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *top)
Definition: vp9dsp_template.c:730
dc_127_4x4_c
static void dc_127_4x4_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *top)
Definition: vp9dsp_template.c:637
scaled_bilin_fn
#define scaled_bilin_fn(opn, opa)
Definition: vp9dsp_template.c:2452
t7
#define t7
Definition: regdef.h:35
a4
#define a4
Definition: regdef.h:50
dc_128_32x32_c
static void dc_128_32x32_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *top)
Definition: vp9dsp_template.c:616
PIXEL_SPLAT_X4
#define PIXEL_SPLAT_X4(x)
Definition: bit_depth_template.c:96
f
#define f(width, name)
Definition: cbs_vp9.c:255
pass
#define pass
Definition: fft_template.c:619
tm_16x16_c
static void tm_16x16_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *_top)
Definition: vp9dsp_template.c:232
q0
static const uint8_t q0[256]
Definition: twofish.c:77
E
#define E
Definition: avdct.c:32
vert_32x32_c
static void vert_32x32_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *_top)
Definition: vp9dsp_template.c:85
FFABS
#define FFABS(a)
Absolute value, Note, INT_MIN / INT64_MIN result in undefined behavior as they are not representable ...
Definition: common.h:72
do_scaled_8tap_c
static av_always_inline void do_scaled_8tap_c(uint8_t *_dst, ptrdiff_t dst_stride, const uint8_t *_src, ptrdiff_t src_stride, int w, int h, int mx, int my, int dx, int dy, int avg, const int16_t(*filters)[8])
Definition: vp9dsp_template.c:2331
tm_8x8_c
static void tm_8x8_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *_top)
Definition: vp9dsp_template.c:208
FILTER_BILIN
#define FILTER_BILIN(src, x, mxy, stride)
t5
#define t5
Definition: regdef.h:33
pixel
uint8_t pixel
Definition: tiny_ssim.c:42
avg_c
static av_always_inline void avg_c(uint8_t *_dst, ptrdiff_t dst_stride, const uint8_t *_src, ptrdiff_t src_stride, int w, int h)
Definition: vp9dsp_template.c:1951
t6
#define t6
Definition: regdef.h:34
TX_8X8
@ TX_8X8
Definition: vp9.h:29
H
F H1 F F H1 F F F F H1<-F-------F-------F v v v H2 H3 H2 ^ ^ ^ F-------F-------F-> H1<-F-------F-------F|||||||||F H1 F|||||||||F H1 Funavailable fullpel samples(outside the picture for example) shall be equalto the closest available fullpel sampleSmaller pel interpolation:--------------------------if diag_mc is set then points which lie on a line between 2 vertically, horizontally or diagonally adjacent halfpel points shall be interpolatedlinearly with rounding to nearest and halfway values rounded up.points which lie on 2 diagonals at the same time should only use the onediagonal not containing the fullpel point F--> O q O<--h1-> O q O<--F v \/v \/v O O O O O O O|/|\|q q q q q|/|\|O O O O O O O ^/\ ^/\ ^ h2--> O q O<--h3-> O q O<--h2 v \/v \/v O O O O O O O|\|/|q q q q q|\|/|O O O O O O O ^/\ ^/\ ^ F--> O q O<--h1-> O q O<--Fthe remaining points shall be bilinearly interpolated from theup to 4 surrounding halfpel and fullpel points, again rounding should be tonearest and halfway values rounded upcompliant Snow decoders MUST support 1-1/8 pel luma and 1/2-1/16 pel chromainterpolation at leastOverlapped block motion compensation:-------------------------------------FIXMELL band prediction:===================Each sample in the LL0 subband is predicted by the median of the left, top andleft+top-topleft samples, samples outside the subband shall be considered tobe 0. 
To reverse this prediction in the decoder apply the following.for(y=0;y< height;y++){ for(x=0;x< width;x++){ sample[y][x]+=median(sample[y-1][x], sample[y][x-1], sample[y-1][x]+sample[y][x-1]-sample[y-1][x-1]);}}sample[-1][ *]=sample[ *][-1]=0;width, height here are the width and height of the LL0 subband not of the finalvideoDequantization:===============FIXMEWavelet Transform:==================Snow supports 2 wavelet transforms, the symmetric biorthogonal 5/3 integertransform and an integer approximation of the symmetric biorthogonal 9/7daubechies wavelet.2D IDWT(inverse discrete wavelet transform) --------------------------------------------The 2D IDWT applies a 2D filter recursively, each time combining the4 lowest frequency subbands into a single subband until only 1 subbandremains.The 2D filter is done by first applying a 1D filter in the vertical directionand then applying it in the horizontal one. --------------- --------------- --------------- ---------------|LL0|HL0|||||||||||||---+---|HL1||L0|H0|HL1||LL1|HL1|||||LH0|HH0|||||||||||||-------+-------|-> L1 H1 LH1 HH1 LH1 HH1 LH1 HH1 this can end with a L or a H
Definition: snow.txt:555
dc_127_32x32_c
static void dc_127_32x32_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *top)
Definition: vp9dsp_template.c:681
dc_16x16_c
static void dc_16x16_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *_top)
Definition: vp9dsp_template.c:350
bit_depth_template.c
hor_32x32_c
static void hor_32x32_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *top)
Definition: vp9dsp_template.c:163
dc_127_8x8_c
static void dc_127_8x8_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *top)
Definition: vp9dsp_template.c:649
TX_16X16
@ TX_16X16
Definition: vp9.h:30
dc_left_8x8_c
static void dc_left_8x8_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *top)
Definition: vp9dsp_template.c:421
dc_128_8x8_c
static void dc_128_8x8_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *top)
Definition: vp9dsp_template.c:584
dc_4x4_c
static void dc_4x4_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *_top)
Definition: vp9dsp_template.c:314
filter_8tap_1d_fn
#define filter_8tap_1d_fn(opn, opa, dir, ds)
Definition: vp9dsp_template.c:2029
vp9dsp.h
t11
#define t11
Definition: regdef.h:56
init_scaled_put_avg
#define init_scaled_put_avg(idx, sz)
dc
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2] ... the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so ...,+,-,+,-,+,+,-,+,-,+,... hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32 - hcoeff[1] - hcoeff[2] - ... a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2} an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||......... intra?||||:Block01 :yes no||||:Block02 :....... 
..........||||:Block03 ::y DC ::ref index:||||:Block04 ::cb DC ::motion x :||||......... :cr DC ::motion y :||||....... ..........|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------ ------------ ------------|||Y subbands||Cb subbands||Cr subbands||||--- ---||--- ---||--- ---|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------ ------------ ------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction ------------|\ Dequantization ------------------- \||Reference frames|\ IDWT|------- -------|Motion \|||Frame 0||Frame 1||Compensation . OBMC v -------|------- -------|--------------. \------> Frame n output Frame Frame<----------------------------------/|...|------------------- Range Coder:============Binary Range Coder:------------------- The implemented range coder is an adapted version based upon "Range encoding: an algorithm for removing redundancy from a digitised message." by G. N. N. Martin. The symbols encoded by the Snow range coder are bits(0|1). The associated probabilities are not fix but change depending on the symbol mix seen so far. 
bit seen|new state ---------+----------------------------------------------- 0|256 - state_transition_table[256 - old_state];1|state_transition_table[old_state];state_transition_table={ 0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:------------------------- FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector 
Prediction:=========================1. the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled top and top right vectors is used as motion vector prediction the used motion vector is the sum of the predictor and(mvx_diff, mvy_diff) *mv_scale Intra DC Prediction block[y][x] dc[1]
Definition: snow.txt:400
def_hor_down
#define def_hor_down(size)
Definition: vp9dsp_template.c:950
dc_8x8_c
static void dc_8x8_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *_top)
Definition: vp9dsp_template.c:330
t12
#define t12
Definition: regdef.h:58
t27
static int t27(InterplayACMContext *s, unsigned ind, unsigned col)
Definition: interplayacm.c:354
dc_129_8x8_c
static void dc_129_8x8_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *top)
Definition: vp9dsp_template.c:715
AV_RN4PA
#define AV_RN4PA
Definition: bit_depth_template.c:92
avg
#define avg(a, b, c, d)
Definition: colorspacedsp_template.c:28
TX_4X4
@ TX_4X4
Definition: vp9.h:28
t8
#define t8
Definition: regdef.h:53
val
const char const char void * val
Definition: avisynth_c.h:863
idct16_1d
static void av_always_inline idct16_1d(float *dst, const float *src, int dst_stridea, int dst_strideb, int src_stridea, int src_strideb, int add)
Definition: vf_dctdnoiz.c:257
ff_vp9dsp_init
av_cold void FUNC() ff_vp9dsp_init(VP9DSPContext *dsp)
Definition: vp9dsp_template.c:2539
FFMIN
#define FFMIN(a, b)
Definition: common.h:96
IN
#define IN(x)
Definition: vp9dsp_template.c:1168
a0
#define a0
Definition: regdef.h:46
iadst16_1d
static av_always_inline void iadst16_1d(const dctcoef *in, ptrdiff_t stride, dctcoef *out, int pass)
Definition: vp9dsp_template.c:1374
dc_top_16x16_c
static void dc_top_16x16_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *_top)
Definition: vp9dsp_template.c:520
dc_128_4x4_c
static void dc_128_4x4_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *top)
Definition: vp9dsp_template.c:571
dc_top_8x8_c
static void dc_top_8x8_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *_top)
Definition: vp9dsp_template.c:502
in
uint8_t pi<< 24) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi - 0x80) *(1.0f/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi - 0x80) *(1.0/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S16, int16_t,(*(const int16_t *) pi >> 8)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S16, int16_t, *(const int16_t *) pi *(1.0f/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S16, int16_t, *(const int16_t *) pi *(1.0/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S32, int32_t,(*(const int32_t *) pi >> 24)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S32, int32_t, *(const int32_t *) pi *(1.0f/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S32, int32_t, *(const int32_t *) pi *(1.0/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_FLT, float, av_clip_uint8(lrintf(*(const float *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_FLT, float, av_clip_int16(lrintf(*(const float *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_FLT, float, av_clipl_int32(llrintf(*(const float *) pi *(1U<< 31)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_DBL, double, av_clip_uint8(lrint(*(const double *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_DBL, double, av_clip_int16(lrint(*(const double *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_DBL, double, av_clipl_int32(llrint(*(const double *) pi *(1U<< 31)))) #define SET_CONV_FUNC_GROUP(ofmt, ifmt) static void set_generic_function(AudioConvert *ac) { } void ff_audio_convert_free(AudioConvert **ac) { if(! 
*ac) return;ff_dither_free(&(*ac) ->dc);av_freep(ac);} AudioConvert *ff_audio_convert_alloc(AVAudioResampleContext *avr, enum AVSampleFormat out_fmt, enum AVSampleFormat in_fmt, int channels, int sample_rate, int apply_map) { AudioConvert *ac;int in_planar, out_planar;ac=av_mallocz(sizeof(*ac));if(!ac) return NULL;ac->avr=avr;ac->out_fmt=out_fmt;ac->in_fmt=in_fmt;ac->channels=channels;ac->apply_map=apply_map;if(avr->dither_method !=AV_RESAMPLE_DITHER_NONE &&av_get_packed_sample_fmt(out_fmt)==AV_SAMPLE_FMT_S16 &&av_get_bytes_per_sample(in_fmt) > 2) { ac->dc=ff_dither_alloc(avr, out_fmt, in_fmt, channels, sample_rate, apply_map);if(!ac->dc) { av_free(ac);return NULL;} return ac;} in_planar=ff_sample_fmt_is_planar(in_fmt, channels);out_planar=ff_sample_fmt_is_planar(out_fmt, channels);if(in_planar==out_planar) { ac->func_type=CONV_FUNC_TYPE_FLAT;ac->planes=in_planar ? ac->channels :1;} else if(in_planar) ac->func_type=CONV_FUNC_TYPE_INTERLEAVE;else ac->func_type=CONV_FUNC_TYPE_DEINTERLEAVE;set_generic_function(ac);if(ARCH_AARCH64) ff_audio_convert_init_aarch64(ac);if(ARCH_ARM) ff_audio_convert_init_arm(ac);if(ARCH_X86) ff_audio_convert_init_x86(ac);return ac;} int ff_audio_convert(AudioConvert *ac, AudioData *out, AudioData *in) { int use_generic=1;int len=in->nb_samples;int p;if(ac->dc) { av_log(ac->avr, AV_LOG_TRACE, "%d samples - audio_convert: %s to %s (dithered)\n", len, av_get_sample_fmt_name(ac->in_fmt), av_get_sample_fmt_name(ac->out_fmt));return ff_convert_dither(ac-> in
Definition: audio_convert.c:326
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:259
t4
#define t4
Definition: regdef.h:32
t3
#define t3
Definition: regdef.h:31
iadst4_1d
static av_always_inline void iadst4_1d(const dctcoef *in, ptrdiff_t stride, dctcoef *out, int pass)
Definition: vp9dsp_template.c:1186
a2
#define a2
Definition: regdef.h:48
common.h
av_always_inline
#define av_always_inline
Definition: attributes.h:43
scaled_filter_fn_set
#define scaled_filter_fn_set(avg)
uint8_t
uint8_t
Definition: audio_convert.c:194
len
int len
Definition: vorbis_enc_data.h:452
dc_129_4x4_c
static void dc_129_4x4_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *top)
Definition: vp9dsp_template.c:702
def_diag_downright
#define def_diag_downright(size)
Definition: vp9dsp_template.c:844
hor_4x4_c
static void hor_4x4_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *top)
Definition: vp9dsp_template.c:114
dc_left_4x4_c
static void dc_left_4x4_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *top)
Definition: vp9dsp_template.c:407
av_clip_pixel
#define av_clip_pixel(a)
Definition: bit_depth_template.c:98
dc_128_16x16_c
static void dc_128_16x16_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *top)
Definition: vp9dsp_template.c:599
left
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2] ... the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so ...,+,-,+,-,+,+,-,+,-,+,... hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32 - hcoeff[1] - hcoeff[2] - ... a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2} an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||......... intra?||||:Block01 :yes no||||:Block02 :....... 
..........||||:Block03 ::y DC ::ref index:||||:Block04 ::cb DC ::motion x :||||......... :cr DC ::motion y :||||....... ..........|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------ ------------ ------------|||Y subbands||Cb subbands||Cr subbands||||--- ---||--- ---||--- ---|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------ ------------ ------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction ------------|\ Dequantization ------------------- \||Reference frames|\ IDWT|------- -------|Motion \|||Frame 0||Frame 1||Compensation . OBMC v -------|------- -------|--------------. \------> Frame n output Frame Frame<----------------------------------/|...|------------------- Range Coder:============Binary Range Coder:------------------- The implemented range coder is an adapted version based upon "Range encoding: an algorithm for removing redundancy from a digitised message." by G. N. N. Martin. The symbols encoded by the Snow range coder are bits(0|1). The associated probabilities are not fix but change depending on the symbol mix seen so far. 
bit seen|new state ---------+----------------------------------------------- 0|256 - state_transition_table[256 - old_state];1|state_transition_table[old_state];state_transition_table={ 0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:------------------------- FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector 
Prediction:=========================1. the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled left
Definition: snow.txt:386
copy_avg_fn
#define copy_avg_fn(sz)
Definition: vp9dsp_template.c:1979
dc_left_16x16_c
static void dc_left_16x16_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *top)
Definition: vp9dsp_template.c:439
a5
#define a5
Definition: regdef.h:51
vert_8x8_c
static void vert_8x8_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *_top)
Definition: vp9dsp_template.c:47
idct8_1d
static void av_always_inline idct8_1d(float *dst, const float *src, int dst_stridea, int dst_strideb, int src_stridea, int src_strideb, int add)
Definition: vf_dctdnoiz.c:129
t2
#define t2
Definition: regdef.h:30
FUNC
#define FUNC(a)
Definition: bit_depth_template.c:104
bilin_1d_fn
#define bilin_1d_fn(opn, opa, dir, ds)
Definition: vp9dsp_template.c:2145
filter_8tap_2d_fn
#define filter_8tap_2d_fn(opn, opa)
Definition: vp9dsp_template.c:2083
t9
#define t9
Definition: regdef.h:54
filter_fn_set
#define filter_fn_set(avg)
DST
#define DST(x, y)
Definition: vp9dsp_template.c:781
itxfm_wrap
#define itxfm_wrap(sz, bits)
Definition: vp9dsp_template.c:1162
TX_32X32
@ TX_32X32
Definition: vp9.h:31
hev
static av_always_inline int hev(uint8_t *p, ptrdiff_t stride, int thresh)
Definition: vp8dsp_mmi.c:729
dc_top_4x4_c
static void dc_top_4x4_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *_top)
Definition: vp9dsp_template.c:488
scaled_filter_8tap_fn
#define scaled_filter_8tap_fn(opn, opa)
Definition: vp9dsp_template.c:2379
dc_127_16x16_c
static void dc_127_16x16_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *top)
Definition: vp9dsp_template.c:664
init_subpel3
#define init_subpel3(idx, type)
def_vert_left
#define def_vert_left(size)
Definition: vp9dsp_template.c:1000
h
h
Definition: vp9dsp_template.c:2038
VP9DSPContext::loop_filter_16
void(* loop_filter_16[2])(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr)
Definition: vp9dsp.h:88
dc_32x32_c
static void dc_32x32_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *_top)
Definition: vp9dsp_template.c:374
def_vert_right
#define def_vert_right(size)
Definition: vp9dsp_template.c:893
hor_16x16_c
static void hor_16x16_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *top)
Definition: vp9dsp_template.c:144
do_8tap_2d_c
static av_always_inline void do_8tap_2d_c(uint8_t *_dst, ptrdiff_t dst_stride, const uint8_t *_src, ptrdiff_t src_stride, int w, int h, const int16_t *filterx, const int16_t *filtery, int avg)
Definition: vp9dsp_template.c:2044
vert_16x16_c
static void vert_16x16_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *_top)
Definition: vp9dsp_template.c:64
a3
#define a3
Definition: regdef.h:49
bilin_2d_fn
#define bilin_2d_fn(opn, opa)
Definition: vp9dsp_template.c:2197
dc_129_32x32_c
static void dc_129_32x32_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *top)
Definition: vp9dsp_template.c:747