FFmpeg
hevcdsp_init.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2013 Seppo Tomperi
3  * Copyright (c) 2013 - 2014 Pierre-Edouard Lepere
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include "config.h"
23 
24 #include "libavutil/cpu.h"
25 #include "libavutil/mem_internal.h"
26 #include "libavutil/x86/asm.h"
27 #include "libavutil/x86/cpu.h"
28 #include "libavcodec/hevcdsp.h"
29 #include "libavcodec/x86/hevcdsp.h"
30 
/*
 * Declare the prototype of one chroma loop-filter routine named
 * ff_hevc_<DIR>_loop_filter_chroma_<DEPTH>_<OPT>.
 * Only a declaration is produced; the definition is not in this file
 * (presumably provided by the x86 assembly - confirm against the build).
 */
31 #define LFC_FUNC(DIR, DEPTH, OPT) \
32 void ff_hevc_ ## DIR ## _loop_filter_chroma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, const int *tc, const uint8_t *no_p, const uint8_t *no_q);
33 
/*
 * Declare the prototype of one luma loop-filter routine named
 * ff_hevc_<DIR>_loop_filter_luma_<DEPTH>_<OPT>.
 * Same parameter list as the chroma prototype plus an extra `beta` argument
 * between `stride` and `tc`. Declaration only.
 */
34 #define LFL_FUNC(DIR, DEPTH, OPT) \
35 void ff_hevc_ ## DIR ## _loop_filter_luma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, int beta, const int *tc, const uint8_t *no_p, const uint8_t *no_q);
36 
/*
 * Declare the `h` and `v` chroma loop-filter prototypes for one bit depth and
 * one instruction-set suffix. The `type` argument is accepted but does not
 * appear in the expansion.
 */
37 #define LFC_FUNCS(type, depth, opt) \
38  LFC_FUNC(h, depth, opt) \
39  LFC_FUNC(v, depth, opt)
40 
/*
 * Declare the `h` and `v` luma loop-filter prototypes for one bit depth and
 * one instruction-set suffix. The `type` argument is accepted but does not
 * appear in the expansion.
 */
41 #define LFL_FUNCS(type, depth, opt) \
42  LFL_FUNC(h, depth, opt) \
43  LFL_FUNC(v, depth, opt)
44 
/*
 * Loop-filter prototypes for bit depths 8, 10 and 12:
 *   chroma: sse2 and avx variants,
 *   luma:   sse2, ssse3 and avx variants.
 * The first argument (uint8_t) is ignored by LFC_FUNCS / LFL_FUNCS.
 */
45 LFC_FUNCS(uint8_t, 8, sse2)
46 LFC_FUNCS(uint8_t, 10, sse2)
47 LFC_FUNCS(uint8_t, 12, sse2)
48 LFC_FUNCS(uint8_t, 8, avx)
49 LFC_FUNCS(uint8_t, 10, avx)
50 LFC_FUNCS(uint8_t, 12, avx)
51 LFL_FUNCS(uint8_t, 8, sse2)
52 LFL_FUNCS(uint8_t, 10, sse2)
53 LFL_FUNCS(uint8_t, 12, sse2)
54 LFL_FUNCS(uint8_t, 8, ssse3)
55 LFL_FUNCS(uint8_t, 10, ssse3)
56 LFL_FUNCS(uint8_t, 12, ssse3)
57 LFL_FUNCS(uint8_t, 8, avx)
58 LFL_FUNCS(uint8_t, 10, avx)
59 LFL_FUNCS(uint8_t, 12, avx)
60 
/*
 * Declare the DC-only inverse-transform prototypes
 * ff_hevc_idct_<W>_dc_{8,10,12}_<opt> for one block size W (e.g. 4x4).
 * The last declaration has no trailing semicolon, so each use of the macro
 * must be followed by ';'.
 */
61 #define IDCT_DC_FUNCS(W, opt) \
62 void ff_hevc_idct_ ## W ## _dc_8_ ## opt(int16_t *coeffs); \
63 void ff_hevc_idct_ ## W ## _dc_10_ ## opt(int16_t *coeffs); \
64 void ff_hevc_idct_ ## W ## _dc_12_ ## opt(int16_t *coeffs)
65 
/*
 * DC-only IDCT prototypes: 4x4 for mmxext; 8x8, 16x16 and 32x32 for sse2;
 * 16x16 and 32x32 additionally for avx2.
 */
66 IDCT_DC_FUNCS(4x4, mmxext);
67 IDCT_DC_FUNCS(8x8, sse2);
68 IDCT_DC_FUNCS(16x16, sse2);
69 IDCT_DC_FUNCS(32x32, sse2);
70 IDCT_DC_FUNCS(16x16, avx2);
71 IDCT_DC_FUNCS(32x32, avx2);
72 
/*
 * Declare the full inverse-transform prototypes for block sizes 4x4, 8x8,
 * 16x16 and 32x32 at bit depths 8 and 10 for one instruction-set suffix.
 * Unlike IDCT_DC_FUNCS, the expansion already ends with ';', so the macro is
 * used without a trailing semicolon.
 */
73 #define IDCT_FUNCS(opt) \
74 void ff_hevc_idct_4x4_8_ ## opt(int16_t *coeffs, int col_limit); \
75 void ff_hevc_idct_4x4_10_ ## opt(int16_t *coeffs, int col_limit); \
76 void ff_hevc_idct_8x8_8_ ## opt(int16_t *coeffs, int col_limit); \
77 void ff_hevc_idct_8x8_10_ ## opt(int16_t *coeffs, int col_limit); \
78 void ff_hevc_idct_16x16_8_ ## opt(int16_t *coeffs, int col_limit); \
79 void ff_hevc_idct_16x16_10_ ## opt(int16_t *coeffs, int col_limit); \
80 void ff_hevc_idct_32x32_8_ ## opt(int16_t *coeffs, int col_limit); \
81 void ff_hevc_idct_32x32_10_ ## opt(int16_t *coeffs, int col_limit);
82 
/* Full IDCT prototypes for the sse2 and avx variants. */
83 IDCT_FUNCS(sse2)
84 IDCT_FUNCS(avx)
85 
/*
 * Define ff_hevc_put_hevc_<name><W>_<bitd>_<opt> as a wrapper that covers a
 * W-wide block by calling the <step>-wide routine with the same name, bit
 * depth and suffix once per column strip.
 *
 * For each strip offset i (0, step, 2*step, ... while i < W):
 *   - the source pointer is advanced by i * ((bitd + 7) / 8) bytes
 *     (1 byte per sample for bitd 8, 2 bytes for bitd 10 and 12);
 *   - the int16_t destination pointer is advanced by i elements.
 * _srcstride, height, mx, my and width are forwarded unchanged.
 */
86 #define mc_rep_func(name, bitd, step, W, opt) \
87 void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst, \
88  const uint8_t *_src, ptrdiff_t _srcstride, int height, \
89  intptr_t mx, intptr_t my, int width) \
90 { \
91  int i; \
92  int16_t *dst; \
93  for (i = 0; i < W; i += step) { \
94  const uint8_t *src = _src + (i * ((bitd + 7) / 8)); \
95  dst = _dst + i; \
96  ff_hevc_put_hevc_##name##step##_##bitd##_##opt(dst, src, _srcstride, height, mx, my, width); \
97  } \
98 }
/*
 * Define ff_hevc_put_hevc_uni_<name><W>_<bitd>_<opt> as a wrapper that covers
 * a W-wide block with repeated calls to the <step>-wide "uni" routine.
 *
 * Both the uint8_t destination and the source are advanced by
 * i * ((bitd + 7) / 8) bytes for strip offset i; dststride, _srcstride,
 * height, mx, my and width are forwarded unchanged.
 */
99 #define mc_rep_uni_func(name, bitd, step, W, opt) \
100 void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, \
101  const uint8_t *_src, ptrdiff_t _srcstride, int height, \
102  intptr_t mx, intptr_t my, int width) \
103 { \
104  int i; \
105  uint8_t *dst; \
106  for (i = 0; i < W; i += step) { \
107  const uint8_t *src = _src + (i * ((bitd + 7) / 8)); \
108  dst = _dst + (i * ((bitd + 7) / 8)); \
109  ff_hevc_put_hevc_uni_##name##step##_##bitd##_##opt(dst, dststride, src, _srcstride, \
110  height, mx, my, width); \
111  } \
112 }
/*
 * Define ff_hevc_put_hevc_bi_<name><W>_<bitd>_<opt> as a wrapper that covers
 * a W-wide block with repeated calls to the <step>-wide "bi" routine.
 *
 * For strip offset i the uint8_t destination and source are advanced by
 * i * ((bitd + 7) / 8) bytes, while the int16_t second source (_src2) is
 * advanced by i elements. All other arguments are forwarded unchanged.
 */
113 #define mc_rep_bi_func(name, bitd, step, W, opt) \
114 void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, const uint8_t *_src, \
115  ptrdiff_t _srcstride, const int16_t *_src2, \
116  int height, intptr_t mx, intptr_t my, int width) \
117 { \
118  int i; \
119  uint8_t *dst; \
120  for (i = 0; i < W ; i += step) { \
121  const uint8_t *src = _src + (i * ((bitd + 7) / 8)); \
122  const int16_t *src2 = _src2 + i; \
123  dst = _dst + (i * ((bitd + 7) / 8)); \
124  ff_hevc_put_hevc_bi_##name##step##_##bitd##_##opt(dst, dststride, src, _srcstride, src2, \
125  height, mx, my, width); \
126  } \
127 }
128 
/*
 * Emit all three repeated-strip wrappers (plain, uni and bi) for one
 * name / bit depth / strip width / total width / suffix combination.
 */
129 #define mc_rep_funcs(name, bitd, step, W, opt) \
130  mc_rep_func(name, bitd, step, W, opt) \
131  mc_rep_uni_func(name, bitd, step, W, opt) \
132  mc_rep_bi_func(name, bitd, step, W, opt)
133 
/*
 * Define ff_hevc_put_hevc_<name><W>_<bitd>_<opt> from two calls of different
 * widths: the <step1>-wide routine at offset 0, then the <step2>-wide routine
 * at column step1 (dst + step1 elements, src + step1 * ((bitd + 7) / 8)
 * bytes). W is used only to form the function name.
 */
134 #define mc_rep_func2(name, bitd, step1, step2, W, opt) \
135 void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *dst, \
136  const uint8_t *src, ptrdiff_t _srcstride, int height, \
137  intptr_t mx, intptr_t my, int width) \
138 { \
139  ff_hevc_put_hevc_##name##step1##_##bitd##_##opt(dst, src, _srcstride, height, mx, my, width); \
140  ff_hevc_put_hevc_##name##step2##_##bitd##_##opt(dst + step1, src + (step1 * ((bitd + 7) / 8)), \
141  _srcstride, height, mx, my, width); \
142 }
/*
 * "uni" counterpart of mc_rep_func2: the <step1>-wide routine handles the
 * first columns, the <step2>-wide routine the rest. For the second call both
 * dst and src are advanced by step1 * ((bitd + 7) / 8) bytes. W is used only
 * to form the function name.
 */
143 #define mc_rep_uni_func2(name, bitd, step1, step2, W, opt) \
144 void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, \
145  const uint8_t *src, ptrdiff_t _srcstride, int height, \
146  intptr_t mx, intptr_t my, int width) \
147 { \
148  ff_hevc_put_hevc_uni_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, height, mx, my, width);\
149  ff_hevc_put_hevc_uni_##name##step2##_##bitd##_##opt(dst + (step1 * ((bitd + 7) / 8)), dststride, \
150  src + (step1 * ((bitd + 7) / 8)), _srcstride, \
151  height, mx, my, width); \
152 }
/*
 * "bi" counterpart of mc_rep_func2: the <step1>-wide routine handles the
 * first columns, the <step2>-wide routine the rest. For the second call dst
 * and src are advanced by step1 * ((bitd + 7) / 8) bytes and the int16_t src2
 * by step1 elements. W is used only to form the function name.
 */
153 #define mc_rep_bi_func2(name, bitd, step1, step2, W, opt) \
154 void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, \
155  ptrdiff_t _srcstride, const int16_t *src2, \
156  int height, intptr_t mx, intptr_t my, int width) \
157 { \
158  ff_hevc_put_hevc_bi_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, src2, height, mx, my, width);\
159  ff_hevc_put_hevc_bi_##name##step2##_##bitd##_##opt(dst + (step1 * ((bitd + 7) / 8)), dststride, \
160  src + (step1 * ((bitd + 7) / 8)), _srcstride, \
161  src2 + step1, height, mx, my, width); \
162 }
163 
/*
 * Emit all three two-call wrappers (plain, uni and bi) for one
 * name / bit depth / step1 / step2 / total width / suffix combination.
 */
164 #define mc_rep_funcs2(name, bitd, step1, step2, W, opt) \
165  mc_rep_func2(name, bitd, step1, step2, W, opt) \
166  mc_rep_uni_func2(name, bitd, step1, step2, W, opt) \
167  mc_rep_bi_func2(name, bitd, step1, step2, W, opt)
168 
169 #if ARCH_X86_64 && HAVE_SSE4_EXTERNAL
170 
/*
 * Define the 10-bit routine ff_hevc_put_hevc_<name><width1>_10_<opt1> by
 * mixing two instruction sets: the <width2>-wide <opt1> routine handles the
 * first columns and the <width3>-wide <opt2> routine handles the rest.
 *
 * For the second call the int16_t dst is advanced by width2 elements and the
 * uint8_t src by width4 bytes, so width4 is the byte offset of column width2
 * and must be supplied by the caller.
 */
171 #define mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
172 void ff_hevc_put_hevc_##name##width1##_10_##opt1(int16_t *dst, const uint8_t *src, ptrdiff_t _srcstride, \
173  int height, intptr_t mx, intptr_t my, int width) \
174  \
175 { \
176  ff_hevc_put_hevc_##name##width2##_10_##opt1(dst, src, _srcstride, height, mx, my, width); \
177  ff_hevc_put_hevc_##name##width3##_10_##opt2(dst+ width2, src+ width4, _srcstride, height, mx, my, width); \
178 }
179 
/*
 * "bi" variant of the 10-bit mixed wrapper: defines
 * ff_hevc_put_hevc_bi_<name><width1>_10_<opt1> from the <width2>-wide <opt1>
 * routine followed by the <width3>-wide <opt2> routine.
 *
 * For the second call dst and src (both byte pointers) are advanced by width4
 * bytes and the int16_t src2 by width2 elements.
 */
180 #define mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
181 void ff_hevc_put_hevc_bi_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, \
182  ptrdiff_t _srcstride, const int16_t *src2, \
183  int height, intptr_t mx, intptr_t my, int width) \
184 { \
185  ff_hevc_put_hevc_bi_##name##width2##_10_##opt1(dst, dststride, src, _srcstride, src2, \
186  height, mx, my, width); \
187  ff_hevc_put_hevc_bi_##name##width3##_10_##opt2(dst+width4, dststride, src+width4, _srcstride, src2+width2,\
188  height, mx, my, width); \
189 }
190 
/*
 * "uni" variant of the 10-bit mixed wrapper: defines
 * ff_hevc_put_hevc_uni_<name><width1>_10_<opt1> from the <width2>-wide <opt1>
 * routine followed by the <width3>-wide <opt2> routine, with dst and src both
 * advanced by width4 bytes for the second call.
 */
191 #define mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
192 void ff_hevc_put_hevc_uni_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride, \
193  const uint8_t *src, ptrdiff_t _srcstride, int height, \
194  intptr_t mx, intptr_t my, int width) \
195 { \
196  ff_hevc_put_hevc_uni_##name##width2##_10_##opt1(dst, dststride, src, _srcstride, \
197  height, mx, my, width); \
198  ff_hevc_put_hevc_uni_##name##width3##_10_##opt2(dst+width4, dststride, src+width4, _srcstride, \
199  height, mx, my, width); \
200 }
201 
/* Emit the plain, bi and uni 10-bit mixed-instruction-set wrappers together. */
202 #define mc_rep_mixs_10(name, width1, width2, width3, opt1, opt2, width4) \
203 mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
204 mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
205 mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)
206 
/*
 * Define the 8-bit routine ff_hevc_put_hevc_<name><width1>_8_<opt1> by mixing
 * two instruction sets: the <width2>-wide <opt1> routine first, then the
 * <width3>-wide <opt2> routine.
 *
 * For the second call the int16_t dst is advanced by width2 elements and the
 * uint8_t src by width2 bytes, so no separate byte-offset argument is taken.
 */
207 #define mc_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
208 void ff_hevc_put_hevc_##name##width1##_8_##opt1(int16_t *dst, const uint8_t *src, ptrdiff_t _srcstride, \
209  int height, intptr_t mx, intptr_t my, int width) \
210  \
211 { \
212  ff_hevc_put_hevc_##name##width2##_8_##opt1(dst, src, _srcstride, height, mx, my, width); \
213  ff_hevc_put_hevc_##name##width3##_8_##opt2(dst+ width2, src+ width2, _srcstride, height, mx, my, width); \
214 }
215 
/*
 * "bi" variant of the 8-bit mixed wrapper: defines
 * ff_hevc_put_hevc_bi_<name><width1>_8_<opt1> from the <width2>-wide <opt1>
 * routine followed by the <width3>-wide <opt2> routine. dst, src and src2 are
 * each advanced by width2 (bytes for dst/src, int16_t elements for src2) for
 * the second call.
 */
216 #define mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
217 void ff_hevc_put_hevc_bi_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, \
218  ptrdiff_t _srcstride, const int16_t *src2, \
219  int height, intptr_t mx, intptr_t my, int width) \
220 { \
221  ff_hevc_put_hevc_bi_##name##width2##_8_##opt1(dst, dststride, src, _srcstride, \
222  src2, height, mx, my, width); \
223  ff_hevc_put_hevc_bi_##name##width3##_8_##opt2(dst+width2, dststride, src+width2, _srcstride, \
224  src2+width2, height, mx, my, width); \
225 }
226 
/*
 * "uni" variant of the 8-bit mixed wrapper: defines
 * ff_hevc_put_hevc_uni_<name><width1>_8_<opt1> from the <width2>-wide <opt1>
 * routine followed by the <width3>-wide <opt2> routine, with dst and src both
 * advanced by width2 bytes for the second call.
 */
227 #define mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
228 void ff_hevc_put_hevc_uni_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride, \
229  const uint8_t *src, ptrdiff_t _srcstride, int height, \
230  intptr_t mx, intptr_t my, int width) \
231 { \
232  ff_hevc_put_hevc_uni_##name##width2##_8_##opt1(dst, dststride, src, _srcstride, \
233  height, mx, my, width); \
234  ff_hevc_put_hevc_uni_##name##width3##_8_##opt2(dst+width2, dststride, src+width2, _srcstride, \
235  height, mx, my, width); \
236 }
237 
/* Emit the plain, bi and uni 8-bit mixed-instruction-set wrappers together. */
238 #define mc_rep_mixs_8(name, width1, width2, width3, opt1, opt2) \
239 mc_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
240 mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
241 mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2)
242 
/*
 * AVX2 wrapper definitions, compiled only when HAVE_AVX2_EXTERNAL is set.
 *
 *  - mc_rep_mix*_8 lines build the 48-wide 8-bit entry points from a 32-wide
 *    avx2 call plus a 16-wide sse4 call.
 *  - mc_rep_mix*_10 lines build the 24-wide 10-bit entry points from a 16-wide
 *    avx2 call plus an 8-wide sse4 call; the last argument (32) is the byte
 *    offset of column 16 at two bytes per sample.
 *  - mc_rep_func / mc_rep_uni_func / mc_rep_bi_func / mc_rep_funcs lines build
 *    wider entry points from repeated avx2 calls of the given strip width.
 *
 * Note: for 10-bit pel_pixels width 24 only the plain and bi mixed wrappers
 * are instantiated here; no uni mixed wrapper is generated in this block.
 */
243 #if HAVE_AVX2_EXTERNAL
244 
245 mc_rep_mixs_8(pel_pixels, 48, 32, 16, avx2, sse4)
246 mc_rep_mixs_8(epel_hv, 48, 32, 16, avx2, sse4)
247 mc_rep_mixs_8(epel_h , 48, 32, 16, avx2, sse4)
248 mc_rep_mixs_8(epel_v , 48, 32, 16, avx2, sse4)
249 
250 mc_rep_mix_10(pel_pixels, 24, 16, 8, avx2, sse4, 32)
251 mc_bi_rep_mix_10(pel_pixels,24, 16, 8, avx2, sse4, 32)
252 mc_rep_mixs_10(epel_hv, 24, 16, 8, avx2, sse4, 32)
253 mc_rep_mixs_10(epel_h , 24, 16, 8, avx2, sse4, 32)
254 mc_rep_mixs_10(epel_v , 24, 16, 8, avx2, sse4, 32)
255 
256 
257 mc_rep_mixs_10(qpel_h , 24, 16, 8, avx2, sse4, 32)
258 mc_rep_mixs_10(qpel_v , 24, 16, 8, avx2, sse4, 32)
259 mc_rep_mixs_10(qpel_hv, 24, 16, 8, avx2, sse4, 32)
260 
261 
262 mc_rep_uni_func(pel_pixels, 8, 64, 128, avx2)//used for 10bit
263 mc_rep_uni_func(pel_pixels, 8, 32, 96, avx2) //used for 10bit
264 
265 mc_rep_funcs(pel_pixels, 8, 32, 64, avx2)
266 
267 mc_rep_func(pel_pixels, 10, 16, 32, avx2)
268 mc_rep_func(pel_pixels, 10, 16, 48, avx2)
269 mc_rep_func(pel_pixels, 10, 32, 64, avx2)
270 
271 mc_rep_bi_func(pel_pixels, 10, 16, 32, avx2)
272 mc_rep_bi_func(pel_pixels, 10, 16, 48, avx2)
273 mc_rep_bi_func(pel_pixels, 10, 32, 64, avx2)
274 
275 mc_rep_funcs(epel_h, 8, 32, 64, avx2)
276 
277 mc_rep_funcs(epel_v, 8, 32, 64, avx2)
278 
279 mc_rep_funcs(epel_h, 10, 16, 32, avx2)
280 mc_rep_funcs(epel_h, 10, 16, 48, avx2)
281 mc_rep_funcs(epel_h, 10, 32, 64, avx2)
282 
283 mc_rep_funcs(epel_v, 10, 16, 32, avx2)
284 mc_rep_funcs(epel_v, 10, 16, 48, avx2)
285 mc_rep_funcs(epel_v, 10, 32, 64, avx2)
286 
287 
288 mc_rep_funcs(epel_hv, 8, 32, 64, avx2)
289 
290 mc_rep_funcs(epel_hv, 10, 16, 32, avx2)
291 mc_rep_funcs(epel_hv, 10, 16, 48, avx2)
292 mc_rep_funcs(epel_hv, 10, 32, 64, avx2)
293 
294 mc_rep_funcs(qpel_h, 8, 32, 64, avx2)
295 mc_rep_mixs_8(qpel_h , 48, 32, 16, avx2, sse4)
296 
297 mc_rep_funcs(qpel_v, 8, 32, 64, avx2)
298 mc_rep_mixs_8(qpel_v, 48, 32, 16, avx2, sse4)
299 
300 mc_rep_funcs(qpel_h, 10, 16, 32, avx2)
301 mc_rep_funcs(qpel_h, 10, 16, 48, avx2)
302 mc_rep_funcs(qpel_h, 10, 32, 64, avx2)
303 
304 mc_rep_funcs(qpel_v, 10, 16, 32, avx2)
305 mc_rep_funcs(qpel_v, 10, 16, 48, avx2)
306 mc_rep_funcs(qpel_v, 10, 32, 64, avx2)
307 
308 mc_rep_funcs(qpel_hv, 10, 16, 32, avx2)
309 mc_rep_funcs(qpel_hv, 10, 16, 48, avx2)
310 mc_rep_funcs(qpel_hv, 10, 32, 64, avx2)
311 
312 #endif //AVX2
313 
/*
 * SSE4 wrapper definitions (inside the ARCH_X86_64 && HAVE_SSE4_EXTERNAL
 * region). Each mc_rep_funcs(name, bitd, step, W, sse4) line defines the
 * plain, uni and bi W-wide entry points as repeated calls to the step-wide
 * sse4 routine. The two mc_rep_funcs2 lines (8-bit epel_hv and qpel_hv,
 * width 12) instead use one 8-wide call followed by one 4-wide call.
 */
314 mc_rep_funcs(pel_pixels, 8, 16, 64, sse4)
315 mc_rep_funcs(pel_pixels, 8, 16, 48, sse4)
316 mc_rep_funcs(pel_pixels, 8, 16, 32, sse4)
317 mc_rep_funcs(pel_pixels, 8, 8, 24, sse4)
318 mc_rep_funcs(pel_pixels,10, 8, 64, sse4)
319 mc_rep_funcs(pel_pixels,10, 8, 48, sse4)
320 mc_rep_funcs(pel_pixels,10, 8, 32, sse4)
321 mc_rep_funcs(pel_pixels,10, 8, 24, sse4)
322 mc_rep_funcs(pel_pixels,10, 8, 16, sse4)
323 mc_rep_funcs(pel_pixels,10, 4, 12, sse4)
324 mc_rep_funcs(pel_pixels,12, 8, 64, sse4)
325 mc_rep_funcs(pel_pixels,12, 8, 48, sse4)
326 mc_rep_funcs(pel_pixels,12, 8, 32, sse4)
327 mc_rep_funcs(pel_pixels,12, 8, 24, sse4)
328 mc_rep_funcs(pel_pixels,12, 8, 16, sse4)
329 mc_rep_funcs(pel_pixels,12, 4, 12, sse4)
330 
331 mc_rep_funcs(epel_h, 8, 16, 64, sse4)
332 mc_rep_funcs(epel_h, 8, 16, 48, sse4)
333 mc_rep_funcs(epel_h, 8, 16, 32, sse4)
334 mc_rep_funcs(epel_h, 8, 8, 24, sse4)
335 mc_rep_funcs(epel_h,10, 8, 64, sse4)
336 mc_rep_funcs(epel_h,10, 8, 48, sse4)
337 mc_rep_funcs(epel_h,10, 8, 32, sse4)
338 mc_rep_funcs(epel_h,10, 8, 24, sse4)
339 mc_rep_funcs(epel_h,10, 8, 16, sse4)
340 mc_rep_funcs(epel_h,10, 4, 12, sse4)
341 mc_rep_funcs(epel_h,12, 8, 64, sse4)
342 mc_rep_funcs(epel_h,12, 8, 48, sse4)
343 mc_rep_funcs(epel_h,12, 8, 32, sse4)
344 mc_rep_funcs(epel_h,12, 8, 24, sse4)
345 mc_rep_funcs(epel_h,12, 8, 16, sse4)
346 mc_rep_funcs(epel_h,12, 4, 12, sse4)
347 mc_rep_funcs(epel_v, 8, 16, 64, sse4)
348 mc_rep_funcs(epel_v, 8, 16, 48, sse4)
349 mc_rep_funcs(epel_v, 8, 16, 32, sse4)
350 mc_rep_funcs(epel_v, 8, 8, 24, sse4)
351 mc_rep_funcs(epel_v,10, 8, 64, sse4)
352 mc_rep_funcs(epel_v,10, 8, 48, sse4)
353 mc_rep_funcs(epel_v,10, 8, 32, sse4)
354 mc_rep_funcs(epel_v,10, 8, 24, sse4)
355 mc_rep_funcs(epel_v,10, 8, 16, sse4)
356 mc_rep_funcs(epel_v,10, 4, 12, sse4)
357 mc_rep_funcs(epel_v,12, 8, 64, sse4)
358 mc_rep_funcs(epel_v,12, 8, 48, sse4)
359 mc_rep_funcs(epel_v,12, 8, 32, sse4)
360 mc_rep_funcs(epel_v,12, 8, 24, sse4)
361 mc_rep_funcs(epel_v,12, 8, 16, sse4)
362 mc_rep_funcs(epel_v,12, 4, 12, sse4)
363 mc_rep_funcs(epel_hv, 8, 16, 64, sse4)
364 mc_rep_funcs(epel_hv, 8, 16, 48, sse4)
365 mc_rep_funcs(epel_hv, 8, 16, 32, sse4)
366 mc_rep_funcs(epel_hv, 8, 8, 24, sse4)
367 mc_rep_funcs2(epel_hv,8, 8, 4, 12, sse4)
368 mc_rep_funcs(epel_hv,10, 8, 64, sse4)
369 mc_rep_funcs(epel_hv,10, 8, 48, sse4)
370 mc_rep_funcs(epel_hv,10, 8, 32, sse4)
371 mc_rep_funcs(epel_hv,10, 8, 24, sse4)
372 mc_rep_funcs(epel_hv,10, 8, 16, sse4)
373 mc_rep_funcs(epel_hv,10, 4, 12, sse4)
374 mc_rep_funcs(epel_hv,12, 8, 64, sse4)
375 mc_rep_funcs(epel_hv,12, 8, 48, sse4)
376 mc_rep_funcs(epel_hv,12, 8, 32, sse4)
377 mc_rep_funcs(epel_hv,12, 8, 24, sse4)
378 mc_rep_funcs(epel_hv,12, 8, 16, sse4)
379 mc_rep_funcs(epel_hv,12, 4, 12, sse4)
380 
381 mc_rep_funcs(qpel_h, 8, 16, 64, sse4)
382 mc_rep_funcs(qpel_h, 8, 16, 48, sse4)
383 mc_rep_funcs(qpel_h, 8, 16, 32, sse4)
384 mc_rep_funcs(qpel_h, 8, 8, 24, sse4)
385 mc_rep_funcs(qpel_h,10, 8, 64, sse4)
386 mc_rep_funcs(qpel_h,10, 8, 48, sse4)
387 mc_rep_funcs(qpel_h,10, 8, 32, sse4)
388 mc_rep_funcs(qpel_h,10, 8, 24, sse4)
389 mc_rep_funcs(qpel_h,10, 8, 16, sse4)
390 mc_rep_funcs(qpel_h,10, 4, 12, sse4)
391 mc_rep_funcs(qpel_h,12, 8, 64, sse4)
392 mc_rep_funcs(qpel_h,12, 8, 48, sse4)
393 mc_rep_funcs(qpel_h,12, 8, 32, sse4)
394 mc_rep_funcs(qpel_h,12, 8, 24, sse4)
395 mc_rep_funcs(qpel_h,12, 8, 16, sse4)
396 mc_rep_funcs(qpel_h,12, 4, 12, sse4)
397 mc_rep_funcs(qpel_v, 8, 16, 64, sse4)
398 mc_rep_funcs(qpel_v, 8, 16, 48, sse4)
399 mc_rep_funcs(qpel_v, 8, 16, 32, sse4)
400 mc_rep_funcs(qpel_v, 8, 8, 24, sse4)
401 mc_rep_funcs(qpel_v,10, 8, 64, sse4)
402 mc_rep_funcs(qpel_v,10, 8, 48, sse4)
403 mc_rep_funcs(qpel_v,10, 8, 32, sse4)
404 mc_rep_funcs(qpel_v,10, 8, 24, sse4)
405 mc_rep_funcs(qpel_v,10, 8, 16, sse4)
406 mc_rep_funcs(qpel_v,10, 4, 12, sse4)
407 mc_rep_funcs(qpel_v,12, 8, 64, sse4)
408 mc_rep_funcs(qpel_v,12, 8, 48, sse4)
409 mc_rep_funcs(qpel_v,12, 8, 32, sse4)
410 mc_rep_funcs(qpel_v,12, 8, 24, sse4)
411 mc_rep_funcs(qpel_v,12, 8, 16, sse4)
412 mc_rep_funcs(qpel_v,12, 4, 12, sse4)
413 mc_rep_funcs(qpel_hv, 8, 8, 64, sse4)
414 mc_rep_funcs(qpel_hv, 8, 8, 48, sse4)
415 mc_rep_funcs(qpel_hv, 8, 8, 32, sse4)
416 mc_rep_funcs(qpel_hv, 8, 8, 24, sse4)
417 mc_rep_funcs(qpel_hv, 8, 8, 16, sse4)
418 mc_rep_funcs2(qpel_hv,8, 8, 4, 12, sse4)
419 mc_rep_funcs(qpel_hv,10, 8, 64, sse4)
420 mc_rep_funcs(qpel_hv,10, 8, 48, sse4)
421 mc_rep_funcs(qpel_hv,10, 8, 32, sse4)
422 mc_rep_funcs(qpel_hv,10, 8, 24, sse4)
423 mc_rep_funcs(qpel_hv,10, 8, 16, sse4)
424 mc_rep_funcs(qpel_hv,10, 4, 12, sse4)
425 mc_rep_funcs(qpel_hv,12, 8, 64, sse4)
426 mc_rep_funcs(qpel_hv,12, 8, 48, sse4)
427 mc_rep_funcs(qpel_hv,12, 8, 32, sse4)
428 mc_rep_funcs(qpel_hv,12, 8, 24, sse4)
429 mc_rep_funcs(qpel_hv,12, 8, 16, sse4)
430 mc_rep_funcs(qpel_hv,12, 4, 12, sse4)
431 
/*
 * Define ff_hevc_put_hevc_uni_w<W>_<bitd>_<opt> as a wrapper that covers a
 * W-wide block with repeated calls to the <step>-wide uni_w routine.
 *
 * For strip offset i the int16_t source is advanced by i elements and the
 * uint8_t destination by i * ((bitd + 7) / 8) bytes. dststride, height,
 * denom, _wx and _ox are forwarded unchanged.
 */
432 #define mc_rep_uni_w(bitd, step, W, opt) \
433 void ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, const int16_t *_src, \
434  int height, int denom, int _wx, int _ox) \
435 { \
436  int i; \
437  uint8_t *dst; \
438  for (i = 0; i < W; i += step) { \
439  const int16_t *src = _src + i; \
440  dst= _dst + (i * ((bitd + 7) / 8)); \
441  ff_hevc_put_hevc_uni_w##step##_##bitd##_##opt(dst, dststride, src, \
442  height, denom, _wx, _ox); \
443  } \
444 }
445 
/*
 * uni_w wrappers for widths 12, 16, 24, 32, 48 and 64 at bit depths 8, 10
 * and 12 (sse4). Width 12 is built from two 6-wide calls; every other width
 * is built from 8-wide calls.
 */
446 mc_rep_uni_w(8, 6, 12, sse4)
447 mc_rep_uni_w(8, 8, 16, sse4)
448 mc_rep_uni_w(8, 8, 24, sse4)
449 mc_rep_uni_w(8, 8, 32, sse4)
450 mc_rep_uni_w(8, 8, 48, sse4)
451 mc_rep_uni_w(8, 8, 64, sse4)
452 
453 mc_rep_uni_w(10, 6, 12, sse4)
454 mc_rep_uni_w(10, 8, 16, sse4)
455 mc_rep_uni_w(10, 8, 24, sse4)
456 mc_rep_uni_w(10, 8, 32, sse4)
457 mc_rep_uni_w(10, 8, 48, sse4)
458 mc_rep_uni_w(10, 8, 64, sse4)
459 
460 mc_rep_uni_w(12, 6, 12, sse4)
461 mc_rep_uni_w(12, 8, 16, sse4)
462 mc_rep_uni_w(12, 8, 24, sse4)
463 mc_rep_uni_w(12, 8, 32, sse4)
464 mc_rep_uni_w(12, 8, 48, sse4)
465 mc_rep_uni_w(12, 8, 64, sse4)
466 
/*
 * Define ff_hevc_put_hevc_bi_w<W>_<bitd>_<opt> as a wrapper that covers a
 * W-wide block with repeated calls to the <step>-wide bi_w routine.
 *
 * For strip offset i both int16_t sources (_src, _src2) are advanced by i
 * elements and the uint8_t destination by i * ((bitd + 7) / 8) bytes.
 * dststride, height, denom, _wx0, _wx1, _ox0 and _ox1 are forwarded unchanged.
 */
467 #define mc_rep_bi_w(bitd, step, W, opt) \
468 void ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, const int16_t *_src, \
469  const int16_t *_src2, int height, \
470  int denom, int _wx0, int _wx1, int _ox0, int _ox1) \
471 { \
472  int i; \
473  uint8_t *dst; \
474  for (i = 0; i < W; i += step) { \
475  const int16_t *src = _src + i; \
476  const int16_t *src2 = _src2 + i; \
477  dst = _dst + (i * ((bitd + 7) / 8)); \
478  ff_hevc_put_hevc_bi_w##step##_##bitd##_##opt(dst, dststride, src, src2, \
479  height, denom, _wx0, _wx1, _ox0, _ox1); \
480  } \
481 }
482 
/*
 * bi_w wrappers for widths 12, 16, 24, 32, 48 and 64 at bit depths 8, 10
 * and 12 (sse4). Width 12 is built from two 6-wide calls; every other width
 * is built from 8-wide calls.
 */
483 mc_rep_bi_w(8, 6, 12, sse4)
484 mc_rep_bi_w(8, 8, 16, sse4)
485 mc_rep_bi_w(8, 8, 24, sse4)
486 mc_rep_bi_w(8, 8, 32, sse4)
487 mc_rep_bi_w(8, 8, 48, sse4)
488 mc_rep_bi_w(8, 8, 64, sse4)
489 
490 mc_rep_bi_w(10, 6, 12, sse4)
491 mc_rep_bi_w(10, 8, 16, sse4)
492 mc_rep_bi_w(10, 8, 24, sse4)
493 mc_rep_bi_w(10, 8, 32, sse4)
494 mc_rep_bi_w(10, 8, 48, sse4)
495 mc_rep_bi_w(10, 8, 64, sse4)
496 
497 mc_rep_bi_w(12, 6, 12, sse4)
498 mc_rep_bi_w(12, 8, 16, sse4)
499 mc_rep_bi_w(12, 8, 24, sse4)
500 mc_rep_bi_w(12, 8, 32, sse4)
501 mc_rep_bi_w(12, 8, 48, sse4)
502 mc_rep_bi_w(12, 8, 64, sse4)
503 
/*
 * Define ff_hevc_put_hevc_uni_w_<name><W>_<bitd>_<opt> as a two-stage wrapper:
 *   1. ff_hevc_put_hevc_<name><W>_<bitd>_<opt> writes its int16_t output into
 *      the local buffer `temp`;
 *   2. ff_hevc_put_hevc_uni_w<W>_<bitd>_<opt> reads `temp` and writes to _dst,
 *      receiving denom, _wx and _ox.
 * `temp` holds 71 * MAX_PB_SIZE int16_t values and is declared through
 * LOCAL_ALIGNED_16 (defined outside this file).
 */
504 #define mc_uni_w_func(name, bitd, W, opt) \
505 void ff_hevc_put_hevc_uni_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride, \
506  const uint8_t *_src, ptrdiff_t _srcstride, \
507  int height, int denom, \
508  int _wx, int _ox, \
509  intptr_t mx, intptr_t my, int width) \
510 { \
511  LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]); \
512  ff_hevc_put_hevc_##name##W##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width); \
513  ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(_dst, _dststride, temp, height, denom, _wx, _ox);\
514 }
515 
/*
 * Emit the uni_w two-stage wrapper for widths 4, 8, 12, 16, 24, 32, 48 and 64.
 * Width 6 is not part of this list; where needed it is instantiated
 * separately with mc_uni_w_func.
 */
516 #define mc_uni_w_funcs(name, bitd, opt) \
517  mc_uni_w_func(name, bitd, 4, opt) \
518  mc_uni_w_func(name, bitd, 8, opt) \
519  mc_uni_w_func(name, bitd, 12, opt) \
520  mc_uni_w_func(name, bitd, 16, opt) \
521  mc_uni_w_func(name, bitd, 24, opt) \
522  mc_uni_w_func(name, bitd, 32, opt) \
523  mc_uni_w_func(name, bitd, 48, opt) \
524  mc_uni_w_func(name, bitd, 64, opt)
525 
/*
 * uni_w wrappers for bit depths 8, 10 and 12 (sse4). The pel_pixels and epel
 * (h, v, hv) families additionally get a width-6 wrapper; the qpel families
 * do not.
 */
526 mc_uni_w_funcs(pel_pixels, 8, sse4)
527 mc_uni_w_func(pel_pixels, 8, 6, sse4)
528 mc_uni_w_funcs(epel_h, 8, sse4)
529 mc_uni_w_func(epel_h, 8, 6, sse4)
530 mc_uni_w_funcs(epel_v, 8, sse4)
531 mc_uni_w_func(epel_v, 8, 6, sse4)
532 mc_uni_w_funcs(epel_hv, 8, sse4)
533 mc_uni_w_func(epel_hv, 8, 6, sse4)
534 mc_uni_w_funcs(qpel_h, 8, sse4)
535 mc_uni_w_funcs(qpel_v, 8, sse4)
536 mc_uni_w_funcs(qpel_hv, 8, sse4)
537 
538 mc_uni_w_funcs(pel_pixels, 10, sse4)
539 mc_uni_w_func(pel_pixels, 10, 6, sse4)
540 mc_uni_w_funcs(epel_h, 10, sse4)
541 mc_uni_w_func(epel_h, 10, 6, sse4)
542 mc_uni_w_funcs(epel_v, 10, sse4)
543 mc_uni_w_func(epel_v, 10, 6, sse4)
544 mc_uni_w_funcs(epel_hv, 10, sse4)
545 mc_uni_w_func(epel_hv, 10, 6, sse4)
546 mc_uni_w_funcs(qpel_h, 10, sse4)
547 mc_uni_w_funcs(qpel_v, 10, sse4)
548 mc_uni_w_funcs(qpel_hv, 10, sse4)
549 
550 mc_uni_w_funcs(pel_pixels, 12, sse4)
551 mc_uni_w_func(pel_pixels, 12, 6, sse4)
552 mc_uni_w_funcs(epel_h, 12, sse4)
553 mc_uni_w_func(epel_h, 12, 6, sse4)
554 mc_uni_w_funcs(epel_v, 12, sse4)
555 mc_uni_w_func(epel_v, 12, 6, sse4)
556 mc_uni_w_funcs(epel_hv, 12, sse4)
557 mc_uni_w_func(epel_hv, 12, 6, sse4)
558 mc_uni_w_funcs(qpel_h, 12, sse4)
559 mc_uni_w_funcs(qpel_v, 12, sse4)
560 mc_uni_w_funcs(qpel_hv, 12, sse4)
561 
/*
 * Define ff_hevc_put_hevc_bi_w_<name><W>_<bitd>_<opt> as a two-stage wrapper:
 *   1. ff_hevc_put_hevc_<name><W>_<bitd>_<opt> writes its int16_t output into
 *      the local buffer `temp`;
 *   2. ff_hevc_put_hevc_bi_w<W>_<bitd>_<opt> reads `temp` together with _src2
 *      and writes to _dst, receiving denom, _wx0, _wx1, _ox0 and _ox1.
 * `temp` holds 71 * MAX_PB_SIZE int16_t values and is declared through
 * LOCAL_ALIGNED_16 (defined outside this file).
 */
562 #define mc_bi_w_func(name, bitd, W, opt) \
563 void ff_hevc_put_hevc_bi_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride, \
564  const uint8_t *_src, ptrdiff_t _srcstride, \
565  const int16_t *_src2, \
566  int height, int denom, \
567  int _wx0, int _wx1, int _ox0, int _ox1, \
568  intptr_t mx, intptr_t my, int width) \
569 { \
570  LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]); \
571  ff_hevc_put_hevc_##name##W##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width); \
572  ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(_dst, _dststride, temp, _src2, \
573  height, denom, _wx0, _wx1, _ox0, _ox1); \
574 }
575 
/*
 * Emit the bi_w two-stage wrapper for widths 4, 8, 12, 16, 24, 32, 48 and 64.
 * Width 6 is not part of this list; where needed it is instantiated
 * separately with mc_bi_w_func.
 */
576 #define mc_bi_w_funcs(name, bitd, opt) \
577  mc_bi_w_func(name, bitd, 4, opt) \
578  mc_bi_w_func(name, bitd, 8, opt) \
579  mc_bi_w_func(name, bitd, 12, opt) \
580  mc_bi_w_func(name, bitd, 16, opt) \
581  mc_bi_w_func(name, bitd, 24, opt) \
582  mc_bi_w_func(name, bitd, 32, opt) \
583  mc_bi_w_func(name, bitd, 48, opt) \
584  mc_bi_w_func(name, bitd, 64, opt)
585 
/*
 * bi_w wrappers for bit depths 8, 10 and 12 (sse4). The pel_pixels and epel
 * (h, v, hv) families additionally get a width-6 wrapper; the qpel families
 * do not.
 */
586 mc_bi_w_funcs(pel_pixels, 8, sse4)
587 mc_bi_w_func(pel_pixels, 8, 6, sse4)
588 mc_bi_w_funcs(epel_h, 8, sse4)
589 mc_bi_w_func(epel_h, 8, 6, sse4)
590 mc_bi_w_funcs(epel_v, 8, sse4)
591 mc_bi_w_func(epel_v, 8, 6, sse4)
592 mc_bi_w_funcs(epel_hv, 8, sse4)
593 mc_bi_w_func(epel_hv, 8, 6, sse4)
594 mc_bi_w_funcs(qpel_h, 8, sse4)
595 mc_bi_w_funcs(qpel_v, 8, sse4)
596 mc_bi_w_funcs(qpel_hv, 8, sse4)
597 
598 mc_bi_w_funcs(pel_pixels, 10, sse4)
599 mc_bi_w_func(pel_pixels, 10, 6, sse4)
600 mc_bi_w_funcs(epel_h, 10, sse4)
601 mc_bi_w_func(epel_h, 10, 6, sse4)
602 mc_bi_w_funcs(epel_v, 10, sse4)
603 mc_bi_w_func(epel_v, 10, 6, sse4)
604 mc_bi_w_funcs(epel_hv, 10, sse4)
605 mc_bi_w_func(epel_hv, 10, 6, sse4)
606 mc_bi_w_funcs(qpel_h, 10, sse4)
607 mc_bi_w_funcs(qpel_v, 10, sse4)
608 mc_bi_w_funcs(qpel_hv, 10, sse4)
609 
610 mc_bi_w_funcs(pel_pixels, 12, sse4)
611 mc_bi_w_func(pel_pixels, 12, 6, sse4)
612 mc_bi_w_funcs(epel_h, 12, sse4)
613 mc_bi_w_func(epel_h, 12, 6, sse4)
614 mc_bi_w_funcs(epel_v, 12, sse4)
615 mc_bi_w_func(epel_v, 12, 6, sse4)
616 mc_bi_w_funcs(epel_hv, 12, sse4)
617 mc_bi_w_func(epel_hv, 12, 6, sse4)
618 mc_bi_w_funcs(qpel_h, 12, sse4)
619 mc_bi_w_funcs(qpel_v, 12, sse4)
620 mc_bi_w_funcs(qpel_hv, 12, sse4)
621 #endif //ARCH_X86_64 && HAVE_SSE4_EXTERNAL
622 
/*
 * Declare the SAO band-filter prototypes
 * ff_hevc_sao_band_filter_{8,16,32,48,64}_<bitd>_<opt> for one bit depth and
 * instruction-set suffix. The leading number in each name is the width
 * variant; all five share the same parameter list. Declarations only.
 */
623 #define SAO_BAND_FILTER_FUNCS(bitd, opt) \
624 void ff_hevc_sao_band_filter_8_##bitd##_##opt(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
625  const int16_t *sao_offset_val, int sao_left_class, int width, int height); \
626 void ff_hevc_sao_band_filter_16_##bitd##_##opt(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
627  const int16_t *sao_offset_val, int sao_left_class, int width, int height); \
628 void ff_hevc_sao_band_filter_32_##bitd##_##opt(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
629  const int16_t *sao_offset_val, int sao_left_class, int width, int height); \
630 void ff_hevc_sao_band_filter_48_##bitd##_##opt(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
631  const int16_t *sao_offset_val, int sao_left_class, int width, int height); \
632 void ff_hevc_sao_band_filter_64_##bitd##_##opt(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
633  const int16_t *sao_offset_val, int sao_left_class, int width, int height);
634 
/* SAO band-filter prototypes for sse2, avx and avx2 at bit depths 8, 10 and 12. */
635 SAO_BAND_FILTER_FUNCS(8, sse2)
636 SAO_BAND_FILTER_FUNCS(10, sse2)
637 SAO_BAND_FILTER_FUNCS(12, sse2)
638 SAO_BAND_FILTER_FUNCS(8, avx)
639 SAO_BAND_FILTER_FUNCS(10, avx)
640 SAO_BAND_FILTER_FUNCS(12, avx)
641 SAO_BAND_FILTER_FUNCS(8, avx2)
642 SAO_BAND_FILTER_FUNCS(10, avx2)
643 SAO_BAND_FILTER_FUNCS(12, avx2)
644 
/*
 * Store the five SAO band-filter routines for the given bit depth and suffix
 * into c->sao_band_filter[0..4] (width variants 8, 16, 32, 48, 64 in that
 * order). Expects a variable named `c` to be in scope at the point of use.
 * Wrapped in do { } while (0) so it behaves as a single statement.
 */
645 #define SAO_BAND_INIT(bitd, opt) do { \
646  c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_##bitd##_##opt; \
647  c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_##bitd##_##opt; \
648  c->sao_band_filter[2] = ff_hevc_sao_band_filter_32_##bitd##_##opt; \
649  c->sao_band_filter[3] = ff_hevc_sao_band_filter_48_##bitd##_##opt; \
650  c->sao_band_filter[4] = ff_hevc_sao_band_filter_64_##bitd##_##opt; \
651 } while (0)
652 
/*
 * Declare the SAO edge-filter prototypes
 * ff_hevc_sao_edge_filter_{8,16,32,48,64}_<bitd>_<opt> for one bit depth and
 * instruction-set suffix. The leading number in each name is the width
 * variant; all five share the same parameter list. Declarations only.
 *
 * The final declaration deliberately has no line-continuation backslash so
 * the macro definition ends on that line and can never absorb the line that
 * follows it.
 */
653 #define SAO_EDGE_FILTER_FUNCS(bitd, opt) \
654 void ff_hevc_sao_edge_filter_8_##bitd##_##opt(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, \
655  const int16_t *sao_offset_val, int eo, int width, int height); \
656 void ff_hevc_sao_edge_filter_16_##bitd##_##opt(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, \
657  const int16_t *sao_offset_val, int eo, int width, int height); \
658 void ff_hevc_sao_edge_filter_32_##bitd##_##opt(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, \
659  const int16_t *sao_offset_val, int eo, int width, int height); \
660 void ff_hevc_sao_edge_filter_48_##bitd##_##opt(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, \
661  const int16_t *sao_offset_val, int eo, int width, int height); \
662 void ff_hevc_sao_edge_filter_64_##bitd##_##opt(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, \
663  const int16_t *sao_offset_val, int eo, int width, int height);
664 
/*
 * SAO edge-filter prototypes: 8-bit for ssse3 and avx2; 10-bit and 12-bit for
 * sse2 and avx2.
 */
665 SAO_EDGE_FILTER_FUNCS(8, ssse3)
666 SAO_EDGE_FILTER_FUNCS(8, avx2)
667 SAO_EDGE_FILTER_FUNCS(10, sse2)
668 SAO_EDGE_FILTER_FUNCS(10, avx2)
669 SAO_EDGE_FILTER_FUNCS(12, sse2)
670 SAO_EDGE_FILTER_FUNCS(12, avx2)
671 
/*
 * Store the five SAO edge-filter routines for the given bit depth and suffix
 * into c->sao_edge_filter[0..4] (width variants 8, 16, 32, 48, 64 in that
 * order). Expects a variable named `c` to be in scope at the point of use.
 * Wrapped in do { } while (0) so it behaves as a single statement.
 */
672 #define SAO_EDGE_INIT(bitd, opt) do { \
673  c->sao_edge_filter[0] = ff_hevc_sao_edge_filter_8_##bitd##_##opt; \
674  c->sao_edge_filter[1] = ff_hevc_sao_edge_filter_16_##bitd##_##opt; \
675  c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_##bitd##_##opt; \
676  c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_##bitd##_##opt; \
677  c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_##bitd##_##opt; \
678 } while (0)
679 
/*
 * Expand to nine PEL_LINK statements for one (my, mx) pair, pairing index
 * 1..9 with the width-suffixed names fname4, fname6, fname8, fname12,
 * fname16, fname24, fname32, fname48 and fname64 respectively.
 * PEL_LINK itself is defined outside this file. The last statement has no
 * trailing semicolon, so each use of the macro must be followed by ';'.
 */
680 #define EPEL_LINKS(pointer, my, mx, fname, bitd, opt ) \
681  PEL_LINK(pointer, 1, my , mx , fname##4 , bitd, opt ); \
682  PEL_LINK(pointer, 2, my , mx , fname##6 , bitd, opt ); \
683  PEL_LINK(pointer, 3, my , mx , fname##8 , bitd, opt ); \
684  PEL_LINK(pointer, 4, my , mx , fname##12, bitd, opt ); \
685  PEL_LINK(pointer, 5, my , mx , fname##16, bitd, opt ); \
686  PEL_LINK(pointer, 6, my , mx , fname##24, bitd, opt ); \
687  PEL_LINK(pointer, 7, my , mx , fname##32, bitd, opt ); \
688  PEL_LINK(pointer, 8, my , mx , fname##48, bitd, opt ); \
689  PEL_LINK(pointer, 9, my , mx , fname##64, bitd, opt )
/*
 * Same as EPEL_LINKS but without the width-6 entry: expands to eight PEL_LINK
 * statements for indices 1 and 3..9 (widths 4, 8, 12, 16, 24, 32, 48, 64);
 * index 2 is left untouched. PEL_LINK is defined outside this file. The last
 * statement has no trailing semicolon, so each use must be followed by ';'.
 */
690 #define QPEL_LINKS(pointer, my, mx, fname, bitd, opt) \
691  PEL_LINK(pointer, 1, my , mx , fname##4 , bitd, opt ); \
692  PEL_LINK(pointer, 3, my , mx , fname##8 , bitd, opt ); \
693  PEL_LINK(pointer, 4, my , mx , fname##12, bitd, opt ); \
694  PEL_LINK(pointer, 5, my , mx , fname##16, bitd, opt ); \
695  PEL_LINK(pointer, 6, my , mx , fname##24, bitd, opt ); \
696  PEL_LINK(pointer, 7, my , mx , fname##32, bitd, opt ); \
697  PEL_LINK(pointer, 8, my , mx , fname##48, bitd, opt ); \
698  PEL_LINK(pointer, 9, my , mx , fname##64, bitd, opt )
699 
701 {
702  int cpu_flags = av_get_cpu_flags();
703 
704  if (bit_depth == 8) {
705  if (EXTERNAL_MMXEXT(cpu_flags)) {
706  c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_mmxext;
707 
708  c->add_residual[0] = ff_hevc_add_residual_4_8_mmxext;
709  }
710  if (EXTERNAL_SSE2(cpu_flags)) {
711  c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
712  c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
713  if (ARCH_X86_64) {
714  c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_sse2;
715  c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_sse2;
716 
717  c->idct[2] = ff_hevc_idct_16x16_8_sse2;
718  c->idct[3] = ff_hevc_idct_32x32_8_sse2;
719  }
720  SAO_BAND_INIT(8, sse2);
721 
722  c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_sse2;
723  c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_sse2;
724  c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_sse2;
725 
726  c->idct[0] = ff_hevc_idct_4x4_8_sse2;
727  c->idct[1] = ff_hevc_idct_8x8_8_sse2;
728 
729  c->add_residual[1] = ff_hevc_add_residual_8_8_sse2;
730  c->add_residual[2] = ff_hevc_add_residual_16_8_sse2;
731  c->add_residual[3] = ff_hevc_add_residual_32_8_sse2;
732  }
733  if (EXTERNAL_SSSE3(cpu_flags)) {
734  if(ARCH_X86_64) {
735  c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
736  c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
737  }
738  SAO_EDGE_INIT(8, ssse3);
739  }
740  if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
741 
742  EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 8, sse4);
743  EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 8, sse4);
744  EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 8, sse4);
745  EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 8, sse4);
746 
747  QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, sse4);
748  QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 8, sse4);
749  QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 8, sse4);
750  QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 8, sse4);
751  }
752  if (EXTERNAL_AVX(cpu_flags)) {
753  c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_avx;
754  c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_avx;
755  if (ARCH_X86_64) {
756  c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx;
757  c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx;
758 
759  c->idct[2] = ff_hevc_idct_16x16_8_avx;
760  c->idct[3] = ff_hevc_idct_32x32_8_avx;
761  }
762  SAO_BAND_INIT(8, avx);
763 
764  c->idct[0] = ff_hevc_idct_4x4_8_avx;
765  c->idct[1] = ff_hevc_idct_8x8_8_avx;
766 
767  c->add_residual[1] = ff_hevc_add_residual_8_8_avx;
768  c->add_residual[2] = ff_hevc_add_residual_16_8_avx;
769  c->add_residual[3] = ff_hevc_add_residual_32_8_avx;
770  }
771  if (EXTERNAL_AVX2(cpu_flags)) {
772  c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_8_avx2;
773  c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_8_avx2;
774  }
776  c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2;
777  c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2;
778  if (ARCH_X86_64) {
779  c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
780  c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
781  c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
782 
783  c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
784  c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
785  c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
786 
787  c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
788  c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
789  c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
790 
791  c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
792  c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
793  c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
794 
795  c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
796  c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
797  c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
798 
799  c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
800  c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
801  c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
802 
803  c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_avx2;
804  c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_8_avx2;
805  c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_8_avx2;
806 
807  c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_avx2;
808  c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_8_avx2;
809  c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_8_avx2;
810 
811  c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_avx2;
812  c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_8_avx2;
813  c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_8_avx2;
814 
815  c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_avx2;
816  c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_8_avx2;
817  c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_8_avx2;
818 
819  c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_avx2;
820  c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_8_avx2;
821  c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_8_avx2;
822 
823  c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_avx2;
824  c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_8_avx2;
825  c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_8_avx2;
826 
827  c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_avx2;
828  c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_8_avx2;
829  c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_8_avx2;
830 
831  c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_avx2;
832  c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_8_avx2;
833  c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_8_avx2;
834 
835  c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_avx2;
836  c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_8_avx2;
837  c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_8_avx2;
838 
839  c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_avx2;
840  c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_avx2;
841  c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_avx2;
842 
843  c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_avx2;
844  c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_avx2;
845  c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_avx2;
846 
847  c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_avx2;
848  c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_avx2;
849  c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_avx2;
850 
851  c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_avx2;
852  c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_avx2;
853  c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_avx2;
854 
855  c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_avx2;
856  c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_avx2;
857  c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_avx2;
858 
859  c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_avx2;
860  c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_avx2;
861  c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_avx2;
862  }
863  SAO_BAND_INIT(8, avx2);
864 
865  c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_8_avx2;
866  c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_8_avx2;
867  c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_8_avx2;
868 
869  c->add_residual[3] = ff_hevc_add_residual_32_8_avx2;
870  }
871  if (EXTERNAL_AVX512ICL(cpu_flags) && ARCH_X86_64) {
872  c->put_hevc_qpel[1][0][1] = ff_hevc_put_hevc_qpel_h4_8_avx512icl;
873  c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_avx512icl;
874  c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_8_avx512icl;
875  c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_avx512icl;
876  c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_avx512icl;
877  c->put_hevc_qpel[3][1][1] = ff_hevc_put_hevc_qpel_hv8_8_avx512icl;
878  }
879  } else if (bit_depth == 10) {
880  if (EXTERNAL_MMXEXT(cpu_flags)) {
881  c->add_residual[0] = ff_hevc_add_residual_4_10_mmxext;
882  c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_mmxext;
883  }
884  if (EXTERNAL_SSE2(cpu_flags)) {
885  c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
886  c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2;
887  if (ARCH_X86_64) {
888  c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_sse2;
889  c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_sse2;
890 
891  c->idct[2] = ff_hevc_idct_16x16_10_sse2;
892  c->idct[3] = ff_hevc_idct_32x32_10_sse2;
893  }
894  SAO_BAND_INIT(10, sse2);
895  SAO_EDGE_INIT(10, sse2);
896 
897  c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_sse2;
898  c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_sse2;
899  c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_sse2;
900 
901  c->idct[0] = ff_hevc_idct_4x4_10_sse2;
902  c->idct[1] = ff_hevc_idct_8x8_10_sse2;
903 
904  c->add_residual[1] = ff_hevc_add_residual_8_10_sse2;
905  c->add_residual[2] = ff_hevc_add_residual_16_10_sse2;
906  c->add_residual[3] = ff_hevc_add_residual_32_10_sse2;
907  }
908  if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
909  c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
910  c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
911  }
912  if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
913  EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 10, sse4);
914  EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 10, sse4);
915  EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 10, sse4);
916  EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 10, sse4);
917 
918  QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, sse4);
919  QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 10, sse4);
920  QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 10, sse4);
921  QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 10, sse4);
922  }
923  if (EXTERNAL_AVX(cpu_flags)) {
924  c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_avx;
925  c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_avx;
926  if (ARCH_X86_64) {
927  c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_avx;
928  c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_avx;
929 
930  c->idct[2] = ff_hevc_idct_16x16_10_avx;
931  c->idct[3] = ff_hevc_idct_32x32_10_avx;
932  }
933 
934  c->idct[0] = ff_hevc_idct_4x4_10_avx;
935  c->idct[1] = ff_hevc_idct_8x8_10_avx;
936 
937  SAO_BAND_INIT(10, avx);
938  }
939  if (EXTERNAL_AVX2(cpu_flags)) {
940  c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_10_avx2;
941  }
943  c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2;
944  c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2;
945  if (ARCH_X86_64) {
946  c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
947  c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
948  c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
949  c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
950  c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
951 
952  c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
953  c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
954  c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
955  c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
956  c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
957 
958  c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
959  c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
960  c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
961  c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
962  c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
963 
964  c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
965  c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
966  c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
967  c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
968  c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
969 
970  c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
971  c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
972  c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
973  c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
974  c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
975  c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
976  c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
977  c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
978  c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
979  c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
980 
981  c->put_hevc_epel[5][0][1] = ff_hevc_put_hevc_epel_h16_10_avx2;
982  c->put_hevc_epel[6][0][1] = ff_hevc_put_hevc_epel_h24_10_avx2;
983  c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_10_avx2;
984  c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_10_avx2;
985  c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_10_avx2;
986 
987  c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_10_avx2;
988  c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_10_avx2;
989  c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_10_avx2;
990  c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_10_avx2;
991  c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_10_avx2;
992 
993  c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_10_avx2;
994  c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_10_avx2;
995  c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_10_avx2;
996  c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_10_avx2;
997  c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_10_avx2;
998 
999  c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_10_avx2;
1000  c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_10_avx2;
1001  c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_10_avx2;
1002  c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_10_avx2;
1003  c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_10_avx2;
1004 
1005  c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_10_avx2;
1006  c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_10_avx2;
1007  c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_10_avx2;
1008  c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_10_avx2;
1009  c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_10_avx2;
1010 
1011  c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_10_avx2;
1012  c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_10_avx2;
1013  c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_10_avx2;
1014  c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_10_avx2;
1015  c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_10_avx2;
1016 
1017  c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_10_avx2;
1018  c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_10_avx2;
1019  c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_10_avx2;
1020  c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_10_avx2;
1021  c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_10_avx2;
1022 
1023  c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_10_avx2;
1024  c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_10_avx2;
1025  c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_10_avx2;
1026  c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_10_avx2;
1027  c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_10_avx2;
1028 
1029  c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_10_avx2;
1030  c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_10_avx2;
1031  c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_10_avx2;
1032  c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_10_avx2;
1033  c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_10_avx2;
1034 
1035  c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_10_avx2;
1036  c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_10_avx2;
1037  c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_10_avx2;
1038  c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_10_avx2;
1039  c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_10_avx2;
1040 
1041  c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_10_avx2;
1042  c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_10_avx2;
1043  c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_10_avx2;
1044  c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_10_avx2;
1045  c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_10_avx2;
1046 
1047  c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_10_avx2;
1048  c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_10_avx2;
1049  c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_10_avx2;
1050  c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_10_avx2;
1051  c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_10_avx2;
1052 
1053  c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_10_avx2;
1054  c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_10_avx2;
1055  c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_10_avx2;
1056  c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_10_avx2;
1057  c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_10_avx2;
1058 
1059  c->put_hevc_qpel_uni[5][1][0] = ff_hevc_put_hevc_uni_qpel_v16_10_avx2;
1060  c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_10_avx2;
1061  c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_10_avx2;
1062  c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_10_avx2;
1063  c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_10_avx2;
1064 
1065  c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_10_avx2;
1066  c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_10_avx2;
1067  c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_10_avx2;
1068  c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_10_avx2;
1069  c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_10_avx2;
1070 
1071  c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_10_avx2;
1072  c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_10_avx2;
1073  c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_10_avx2;
1074  c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_10_avx2;
1075  c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_10_avx2;
1076 
1077  c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_10_avx2;
1078  c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_10_avx2;
1079  c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_10_avx2;
1080  c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_10_avx2;
1081  c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_10_avx2;
1082 
1083  c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_10_avx2;
1084  c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_10_avx2;
1085  c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_10_avx2;
1086  c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_10_avx2;
1087  c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_10_avx2;
1088  }
1089  SAO_BAND_INIT(10, avx2);
1090  SAO_EDGE_INIT(10, avx2);
1091 
1092  c->add_residual[2] = ff_hevc_add_residual_16_10_avx2;
1093  c->add_residual[3] = ff_hevc_add_residual_32_10_avx2;
1094  }
1095  } else if (bit_depth == 12) {
1096  if (EXTERNAL_MMXEXT(cpu_flags)) {
1097  c->idct_dc[0] = ff_hevc_idct_4x4_dc_12_mmxext;
1098  }
1099  if (EXTERNAL_SSE2(cpu_flags)) {
1100  c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2;
1101  c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2;
1102  if (ARCH_X86_64) {
1103  c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
1104  c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
1105  }
1106  SAO_BAND_INIT(12, sse2);
1107  SAO_EDGE_INIT(12, sse2);
1108 
1109  c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_sse2;
1110  c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_sse2;
1111  c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_sse2;
1112  }
1113  if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
1114  c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
1115  c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_ssse3;
1116  }
1117  if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
1118  EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 12, sse4);
1119  EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 12, sse4);
1120  EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 12, sse4);
1121  EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 12, sse4);
1122 
1123  QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 12, sse4);
1124  QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 12, sse4);
1125  QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 12, sse4);
1126  QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 12, sse4);
1127  }
1128  if (EXTERNAL_AVX(cpu_flags)) {
1129  c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_avx;
1130  c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_avx;
1131  if (ARCH_X86_64) {
1132  c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_avx;
1133  c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_avx;
1134  }
1135  SAO_BAND_INIT(12, avx);
1136  }
1137  if (EXTERNAL_AVX2(cpu_flags)) {
1138  c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_12_avx2;
1139  }
1141  c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_avx2;
1142  c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_avx2;
1143 
1144  SAO_BAND_INIT(12, avx2);
1145  SAO_EDGE_INIT(12, avx2);
1146  }
1147  }
1148 }
ff_hevc_put_hevc_pel_pixels64_10_avx2
void ff_hevc_put_hevc_pel_pixels64_10_avx2(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
cpu.h
LFL_FUNCS
#define LFL_FUNCS(type, depth, opt)
Definition: hevcdsp_init.c:41
mem_internal.h
ff_hevc_put_hevc_uni_pel_pixels64_8_avx2
void ff_hevc_put_hevc_uni_pel_pixels64_8_avx2(uint8_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_put_hevc_qpel_h8_8_avx512icl
void ff_hevc_put_hevc_qpel_h8_8_avx512icl(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_put_hevc_uni_pel_pixels128_8_avx2
void ff_hevc_put_hevc_uni_pel_pixels128_8_avx2(uint8_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_add_residual_16_8_sse2
void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
mc_rep_funcs2
#define mc_rep_funcs2(name, bitd, step1, step2, W, opt)
Definition: hevcdsp_init.c:164
EXTERNAL_AVX2_FAST
#define EXTERNAL_AVX2_FAST(flags)
Definition: cpu.h:79
mc_rep_func
#define mc_rep_func(name, bitd, step, W, opt)
Definition: hevcdsp_init.c:86
ff_hevc_put_hevc_pel_pixels32_8_avx2
void ff_hevc_put_hevc_pel_pixels32_8_avx2(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_put_hevc_bi_pel_pixels64_8_avx2
void ff_hevc_put_hevc_bi_pel_pixels64_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t mx, intptr_t my, int width)
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:103
bit_depth
static void bit_depth(AudioStatsContext *s, const uint64_t *const mask, uint8_t *depth)
Definition: af_astats.c:245
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:52
QPEL_LINKS
#define QPEL_LINKS(pointer, my, mx, fname, bitd, opt)
Definition: hevcdsp_init.c:690
IDCT_FUNCS
#define IDCT_FUNCS(opt)
Definition: hevcdsp_init.c:73
ff_hevc_put_hevc_pel_pixels48_8_avx2
void ff_hevc_put_hevc_pel_pixels48_8_avx2(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
EPEL_LINKS
#define EPEL_LINKS(pointer, my, mx, fname, bitd, opt)
Definition: hevcdsp_init.c:680
ff_hevc_put_hevc_uni_pel_pixels32_8_avx2
void ff_hevc_put_hevc_uni_pel_pixels32_8_avx2(uint8_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_put_hevc_bi_pel_pixels32_10_avx2
void ff_hevc_put_hevc_bi_pel_pixels32_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t mx, intptr_t my, int width)
EXTERNAL_AVX2
#define EXTERNAL_AVX2(flags)
Definition: cpu.h:78
ff_hevc_put_hevc_bi_pel_pixels16_10_avx2
void ff_hevc_put_hevc_bi_pel_pixels16_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_put_hevc_bi_pel_pixels32_8_avx2
void ff_hevc_put_hevc_bi_pel_pixels32_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_put_hevc_qpel_h32_8_avx512icl
void ff_hevc_put_hevc_qpel_h32_8_avx512icl(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
LFC_FUNCS
#define LFC_FUNCS(type, depth, opt)
Definition: hevcdsp_init.c:37
ff_hevc_put_hevc_uni_pel_pixels96_8_avx2
void ff_hevc_put_hevc_uni_pel_pixels96_8_avx2(uint8_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_put_hevc_bi_pel_pixels64_10_avx2
void ff_hevc_put_hevc_bi_pel_pixels64_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t mx, intptr_t my, int width)
hevcdsp.h
ff_hevc_add_residual_8_8_avx
void ff_hevc_add_residual_8_8_avx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
ff_hevc_put_hevc_qpel_h64_8_avx512icl
void ff_hevc_put_hevc_qpel_h64_8_avx512icl(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_dsp_init_x86
void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
Definition: hevcdsp_init.c:700
hevcdsp.h
ff_hevc_add_residual_4_8_mmxext
void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
ff_hevc_put_hevc_pel_pixels32_10_avx2
void ff_hevc_put_hevc_pel_pixels32_10_avx2(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_put_hevc_bi_pel_pixels48_8_avx2
void ff_hevc_put_hevc_bi_pel_pixels48_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_add_residual_16_10_avx2
void ff_hevc_add_residual_16_10_avx2(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
c
Undefined Behavior: In the C language, some operations are undefined, like signed integer overflow, dereferencing freed pointers, accessing outside allocated space, ... Undefined Behavior must not occur in a C program; it is not safe even if the output of undefined operations is unused. The unsafety may seem nit-picking, but optimizing compilers have in fact optimized code on the assumption that no undefined behavior occurs. Optimizing code based on wrong assumptions can and has in some cases led to effects beyond the output of computations. The signed integer overflow problem in speed critical code: Code which is highly optimized and works with signed integers sometimes has the problem that some (invalid) inputs can trigger overflows (undefined behavior). In these cases, often the output of the computation does not matter (as it is from invalid input). [...] c
Definition: undefined.txt:32
ff_hevc_add_residual_32_8_avx
void ff_hevc_add_residual_32_8_avx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
ff_hevc_add_residual_8_10_sse2
void ff_hevc_add_residual_8_10_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
ff_hevc_put_hevc_pel_pixels64_8_avx2
void ff_hevc_put_hevc_pel_pixels64_8_avx2(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_put_hevc_pel_pixels24_10_avx2
void ff_hevc_put_hevc_pel_pixels24_10_avx2(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
cpu.h
asm.h
ff_hevc_put_hevc_pel_pixels48_10_avx2
void ff_hevc_put_hevc_pel_pixels48_10_avx2(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_put_hevc_uni_pel_pixels48_8_avx2
void ff_hevc_put_hevc_uni_pel_pixels48_8_avx2(uint8_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
HEVCDSPContext
Definition: hevcdsp.h:47
ff_hevc_add_residual_32_10_sse2
void ff_hevc_add_residual_32_10_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
SAO_BAND_FILTER_FUNCS
#define SAO_BAND_FILTER_FUNCS(bitd, opt)
Definition: hevcdsp_init.c:623
EXTERNAL_SSE2
#define EXTERNAL_SSE2(flags)
Definition: cpu.h:59
SAO_EDGE_FILTER_FUNCS
#define SAO_EDGE_FILTER_FUNCS(bitd, opt)
Definition: hevcdsp_init.c:653
SAO_EDGE_INIT
#define SAO_EDGE_INIT(bitd, opt)
Definition: hevcdsp_init.c:672
ff_hevc_put_hevc_qpel_h4_8_avx512icl
void ff_hevc_put_hevc_qpel_h4_8_avx512icl(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
IDCT_DC_FUNCS
#define IDCT_DC_FUNCS(W, opt)
Definition: hevcdsp_init.c:61
ff_hevc_put_hevc_bi_pel_pixels24_10_avx2
void ff_hevc_put_hevc_bi_pel_pixels24_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_add_residual_8_8_sse2
void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
mc_rep_funcs
#define mc_rep_funcs(name, bitd, step, W, opt)
Definition: hevcdsp_init.c:129
ff_hevc_put_hevc_pel_pixels16_10_avx2
void ff_hevc_put_hevc_pel_pixels16_10_avx2(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
mc_rep_bi_func
#define mc_rep_bi_func(name, bitd, step, W, opt)
Definition: hevcdsp_init.c:113
EXTERNAL_AVX
#define EXTERNAL_AVX(flags)
Definition: cpu.h:70
ff_hevc_add_residual_32_8_avx2
void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
EXTERNAL_SSE4
#define EXTERNAL_SSE4(flags)
Definition: cpu.h:68
mc_rep_uni_func
#define mc_rep_uni_func(name, bitd, step, W, opt)
Definition: hevcdsp_init.c:99
EXTERNAL_AVX512ICL
#define EXTERNAL_AVX512ICL(flags)
Definition: cpu.h:83
ff_hevc_add_residual_16_10_sse2
void ff_hevc_add_residual_16_10_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
SAO_BAND_INIT
#define SAO_BAND_INIT(bitd, opt)
Definition: hevcdsp_init.c:645
ff_hevc_add_residual_16_8_avx
void ff_hevc_add_residual_16_8_avx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
ff_hevc_add_residual_4_10_mmxext
void ff_hevc_add_residual_4_10_mmxext(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
ff_hevc_put_hevc_qpel_hv8_8_avx512icl
void ff_hevc_put_hevc_qpel_hv8_8_avx512icl(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_put_hevc_qpel_h16_8_avx512icl
void ff_hevc_put_hevc_qpel_h16_8_avx512icl(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_add_residual_32_10_avx2
void ff_hevc_add_residual_32_10_avx2(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
ff_hevc_put_hevc_bi_pel_pixels48_10_avx2
void ff_hevc_put_hevc_bi_pel_pixels48_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t mx, intptr_t my, int width)
EXTERNAL_SSSE3
#define EXTERNAL_SSSE3(flags)
Definition: cpu.h:65
EXTERNAL_MMXEXT
#define EXTERNAL_MMXEXT(flags)
Definition: cpu.h:57
ff_hevc_add_residual_32_8_sse2
void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t stride)