Go to the documentation of this file.
/*
 * Token-pasting helpers used to build symbol names.
 *   bf(fn, bd, opt)  expands to  fn_<bd>_<opt>
 *   BF(fn, bpc, opt) expands to  fn_<bpc>bpc_<opt>
 * In BF the trailing "bpc_" is a literal token, not the macro parameter, so
 * BF(x, 16, avx2) yields x_16bpc_avx2.
 */
36 #define bf(fn, bd, opt) fn##_##bd##_##opt
37 #define BF(fn, bpc, opt) fn##_##bpc##bpc_##opt
/*
 * Declares the four DMVR routines for one (bd, opt) pair:
 *   ff_vvc_dmvr_<bd>_<opt>, ff_vvc_dmvr_h_<bd>_<opt>,
 *   ff_vvc_dmvr_v_<bd>_<opt>, ff_vvc_dmvr_hv_<bd>_<opt>.
 * All four share the same parameter list.
 * NOTE(review): the last line still ends in a continuation and the listing
 * skips original lines 48-52, so the end of this macro and its invocations
 * are not visible here.
 */
39 #define DMVR_PROTOTYPES(bd, opt) \
40 void ff_vvc_dmvr_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \
41 int height, intptr_t mx, intptr_t my, int width); \
42 void ff_vvc_dmvr_h_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \
43 int height, intptr_t mx, intptr_t my, int width); \
44 void ff_vvc_dmvr_v_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \
45 int height, intptr_t mx, intptr_t my, int width); \
46 void ff_vvc_dmvr_hv_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \
47 int height, intptr_t mx, intptr_t my, int width); \
53 #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
/*
 * Prototype of the AVX2 BDOF routine; no definition is visible in this
 * listing (presumably implemented in assembly -- TODO confirm).
 * The OF_FUNC wrappers call it with pixel_max = (1 << bd) - 1.
 */
54 void ff_vvc_apply_bdof_avx2(uint8_t *
dst, ptrdiff_t dst_stride,
55 const int16_t *
src0,
const int16_t *
src1,
56 int w,
int h,
int pixel_max);
/*
 * OF_FUNC(bd, opt): defines the static wrapper vvc_apply_bdof_<bd>_<opt>,
 * which forwards its arguments to ff_vvc_apply_bdof_<opt> and appends
 * (1 << bd) - 1 as the final (pixel_max) argument.
 * OF_INIT(bd): stores vvc_apply_bdof_<bd>_avx2 in c->inter.apply_bdof.
 * NOTE(review): the listing omits original lines 61 and 63-68 (presumably
 * the wrapper's braces and the OF_FUNC invocations -- TODO confirm).
 */
58 #define OF_FUNC(bd, opt) \
59 static void vvc_apply_bdof_##bd##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
60 const int16_t *src0, const int16_t *src1, int w, int h) \
62 ff_vvc_apply_bdof##_##opt(dst, dst_stride, src0, src1, w, h, (1 << bd) - 1); \
69 #define OF_INIT(bd) c->inter.apply_bdof = vvc_apply_bdof_##bd##_avx2
/*
 * ALF_BPC_PROTOTYPES(bpc, opt): declares, via BF(), the four ALF routines
 *   ff_vvc_alf_filter_luma_<bpc>bpc_<opt>
 *   ff_vvc_alf_filter_chroma_<bpc>bpc_<opt>
 *   ff_vvc_alf_classify_grad_<bpc>bpc_<opt>
 *   ff_vvc_alf_classify_<bpc>bpc_<opt>
 * Sizes and positions are passed as ptrdiff_t / intptr_t here, whereas the
 * ALF_FUNCS wrappers that call these take plain int.
 * Instantiated below for bpc = 8 and bpc = 16 with opt = avx2.
 */
72 #define ALF_BPC_PROTOTYPES(bpc, opt) \
73 void BF(ff_vvc_alf_filter_luma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
74 const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, \
75 const int16_t *filter, const int16_t *clip, ptrdiff_t stride, ptrdiff_t vb_pos, ptrdiff_t pixel_max); \
76 void BF(ff_vvc_alf_filter_chroma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
77 const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, \
78 const int16_t *filter, const int16_t *clip, ptrdiff_t stride, ptrdiff_t vb_pos, ptrdiff_t pixel_max); \
79 void BF(ff_vvc_alf_classify_grad, bpc, opt)(int *gradient_sum, \
80 const uint8_t *src, ptrdiff_t src_stride, intptr_t width, intptr_t height, intptr_t vb_pos); \
81 void BF(ff_vvc_alf_classify, bpc, opt)(int *class_idx, int *transpose_idx, const int *gradient_sum, \
82 intptr_t width, intptr_t height, intptr_t vb_pos, intptr_t bit_depth); \
84 ALF_BPC_PROTOTYPES(8, avx2)
85 ALF_BPC_PROTOTYPES(16, avx2)
/*
 * FW_PUT(name, depth, opt): defines the static forwarder
 * vvc_put_<name>_<depth>_<opt>, which calls ff_h2656_put_<name>_<depth>_<opt>
 * with the same arguments plus 2 * MAX_PB_SIZE inserted as the second
 * argument (presumably the destination stride in bytes -- TODO confirm
 * against the ff_h2656_put_* prototypes).
 * NOTE(review): the listing omits original lines 91 and 93 (presumably the
 * function braces).
 */
88 #define FW_PUT(name, depth, opt) \
89 static void vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, \
90 int height, const int8_t *hf, const int8_t *vf, int width) \
92 ff_h2656_put_## name ## _ ## depth ## _##opt(dst, 2 * MAX_PB_SIZE, src, srcstride, height, hf, vf, width); \
95 #if HAVE_SSE4_EXTERNAL
/*
 * SSE4 forwarder generators, all built on FW_PUT:
 *   FW_PUT_TAP(fname, bitd, opt)   -> FW_PUT for fname4, 8, 16, 32, 64, 128
 *   FW_PUT_4TAP(fname, bitd, opt)  -> FW_PUT for fname2, then FW_PUT_TAP
 *   FW_PUT_4TAP_SSE4(bitd)         -> pixels, 4tap_h, 4tap_v, 4tap_hv
 *   FW_PUT_8TAP_SSE4(bitd)         -> 8tap_h, 8tap_v, 8tap_hv
 *   FW_PUT_SSE4(bitd)              -> both of the above
 * NOTE(review): the invocations of FW_PUT_SSE4 (original lines 122-127) are
 * not present in this listing.
 */
96 #define FW_PUT_TAP(fname, bitd, opt ) \
97 FW_PUT(fname##4, bitd, opt ) \
98 FW_PUT(fname##8, bitd, opt ) \
99 FW_PUT(fname##16, bitd, opt ) \
100 FW_PUT(fname##32, bitd, opt ) \
101 FW_PUT(fname##64, bitd, opt ) \
102 FW_PUT(fname##128, bitd, opt ) \
104 #define FW_PUT_4TAP(fname, bitd, opt) \
105 FW_PUT(fname ## 2, bitd, opt) \
106 FW_PUT_TAP(fname, bitd, opt)
108 #define FW_PUT_4TAP_SSE4(bitd) \
109 FW_PUT_4TAP(pixels, bitd, sse4) \
110 FW_PUT_4TAP(4tap_h, bitd, sse4) \
111 FW_PUT_4TAP(4tap_v, bitd, sse4) \
112 FW_PUT_4TAP(4tap_hv, bitd, sse4)
114 #define FW_PUT_8TAP_SSE4(bitd) \
115 FW_PUT_TAP(8tap_h, bitd, sse4) \
116 FW_PUT_TAP(8tap_v, bitd, sse4) \
117 FW_PUT_TAP(8tap_hv, bitd, sse4)
119 #define FW_PUT_SSE4(bitd) \
120 FW_PUT_4TAP_SSE4(bitd) \
121 FW_PUT_8TAP_SSE4(bitd)
128 #if HAVE_AVX2_EXTERNAL
/*
 * AVX2 forwarder generators, all built on FW_PUT with opt = avx2:
 *   FW_PUT_TAP_AVX2(n, bitd)        -> <n>tap_h{32,64,128}, <n>tap_v{32,64,128}
 *   FW_PUT_AVX2(bitd)               -> pixels{32,64,128} plus the 4- and 8-tap sets
 *   FW_PUT_TAP_16BPC_AVX2(n, bitd)  -> <n>tap_h16, <n>tap_v16, <n>tap_hv{16,32,64,128}
 *   FW_PUT_16BPC_AVX2(bitd)         -> pixels16 plus the 4- and 8-tap 16BPC sets
 * FW_PUT_16BPC_AVX2 is instantiated below for bit depths 10 and 12.
 * NOTE(review): FW_PUT_AVX2's last visible line ends in a continuation and
 * original lines 143-147 are missing, so its invocations are not visible.
 */
129 #define FW_PUT_TAP_AVX2(n, bitd) \
130 FW_PUT(n ## tap_h32, bitd, avx2) \
131 FW_PUT(n ## tap_h64, bitd, avx2) \
132 FW_PUT(n ## tap_h128, bitd, avx2) \
133 FW_PUT(n ## tap_v32, bitd, avx2) \
134 FW_PUT(n ## tap_v64, bitd, avx2) \
135 FW_PUT(n ## tap_v128, bitd, avx2)
137 #define FW_PUT_AVX2(bitd) \
138 FW_PUT(pixels32, bitd, avx2) \
139 FW_PUT(pixels64, bitd, avx2) \
140 FW_PUT(pixels128, bitd, avx2) \
141 FW_PUT_TAP_AVX2(4, bitd) \
142 FW_PUT_TAP_AVX2(8, bitd) \
148 #define FW_PUT_TAP_16BPC_AVX2(n, bitd) \
149 FW_PUT(n ## tap_h16, bitd, avx2) \
150 FW_PUT(n ## tap_v16, bitd, avx2) \
151 FW_PUT(n ## tap_hv16, bitd, avx2) \
152 FW_PUT(n ## tap_hv32, bitd, avx2) \
153 FW_PUT(n ## tap_hv64, bitd, avx2) \
154 FW_PUT(n ## tap_hv128, bitd, avx2)
156 #define FW_PUT_16BPC_AVX2(bitd) \
157 FW_PUT(pixels16, bitd, avx2) \
158 FW_PUT_TAP_16BPC_AVX2(4, bitd) \
159 FW_PUT_TAP_16BPC_AVX2(8, bitd)
161 FW_PUT_16BPC_AVX2(10)
162 FW_PUT_16BPC_AVX2(12)
/*
 * ALF_FUNCS(bpc, bd, opt): defines three static wrappers named with bf()
 * (bit depth) that call the BF()-named (bits-per-component) routines:
 *   vvc_alf_filter_luma_<bd>_<opt>   - passes param_stride =
 *       (width >> 2) * ALF_NUM_COEFF_LUMA and pixel_max = (1 << bd) - 1.
 *   vvc_alf_filter_chroma_<bd>_<opt> - passes 0 as the stride argument and
 *       pixel_max = (1 << bd) - 1.
 *   vvc_alf_classify_<bd>_<opt>      - calls the classify_grad routine into
 *       gradient_tmp, then the classify routine on that buffer with bd as
 *       the bit-depth argument.
 * Instantiated below as (8, 8), (16, 10) and (16, 12), i.e. the 10- and
 * 12-bit wrappers both use the 16bpc routines.
 * NOTE(review): the listing omits the lines between the signatures and the
 * bodies (presumably the function braces).
 */
164 #define ALF_FUNCS(bpc, bd, opt) \
165 static void bf(vvc_alf_filter_luma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, \
166 int width, int height, const int16_t *filter, const int16_t *clip, const int vb_pos) \
168 const int param_stride = (width >> 2) * ALF_NUM_COEFF_LUMA; \
169 BF(ff_vvc_alf_filter_luma, bpc, opt)(dst, dst_stride, src, src_stride, width, height, \
170 filter, clip, param_stride, vb_pos, (1 << bd) - 1); \
172 static void bf(vvc_alf_filter_chroma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, \
173 int width, int height, const int16_t *filter, const int16_t *clip, const int vb_pos) \
175 BF(ff_vvc_alf_filter_chroma, bpc, opt)(dst, dst_stride, src, src_stride, width, height, \
176 filter, clip, 0, vb_pos,(1 << bd) - 1); \
178 static void bf(vvc_alf_classify, bd, opt)(int *class_idx, int *transpose_idx, \
179 const uint8_t *src, ptrdiff_t src_stride, int width, int height, int vb_pos, int *gradient_tmp) \
181 BF(ff_vvc_alf_classify_grad, bpc, opt)(gradient_tmp, src, src_stride, width, height, vb_pos); \
182 BF(ff_vvc_alf_classify, bpc, opt)(class_idx, transpose_idx, gradient_tmp, width, height, vb_pos, bd); \
185 ALF_FUNCS(8, 8, avx2)
186 ALF_FUNCS(16, 10, avx2)
187 ALF_FUNCS(16, 12, avx2)
/*
 * SAO_FILTER_FUNC(wd, bitd, opt): declares the band and edge SAO filters
 *   ff_vvc_sao_band_filter_<wd>_<bitd>_<opt>
 *   ff_vvc_sao_edge_filter_<wd>_<bitd>_<opt>
 * The band filter takes separate dst/src strides; the edge filter takes only
 * a dst stride.
 * SAO_FILTER_FUNCS(bitd, opt): repeats the declarations for
 * wd = 8, 16, 32, 48, 64, 80, 96, 112, 128.
 * Instantiated below for bit depths 8, 10 and 12 with opt = avx2.
 */
191 #define SAO_FILTER_FUNC(wd, bitd, opt) \
192 void ff_vvc_sao_band_filter_##wd##_##bitd##_##opt(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
193 const int16_t *sao_offset_val, int sao_left_class, int width, int height); \
194 void ff_vvc_sao_edge_filter_##wd##_##bitd##_##opt(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, \
195 const int16_t *sao_offset_val, int eo, int width, int height); \
197 #define SAO_FILTER_FUNCS(bitd, opt) \
198 SAO_FILTER_FUNC(8, bitd, opt) \
199 SAO_FILTER_FUNC(16, bitd, opt) \
200 SAO_FILTER_FUNC(32, bitd, opt) \
201 SAO_FILTER_FUNC(48, bitd, opt) \
202 SAO_FILTER_FUNC(64, bitd, opt) \
203 SAO_FILTER_FUNC(80, bitd, opt) \
204 SAO_FILTER_FUNC(96, bitd, opt) \
205 SAO_FILTER_FUNC(112, bitd, opt) \
206 SAO_FILTER_FUNC(128, bitd, opt) \
208 SAO_FILTER_FUNCS(8, avx2)
209 SAO_FILTER_FUNCS(10, avx2)
210 SAO_FILTER_FUNCS(12, avx2)
/*
 * SAO_FILTER_INIT(type, bitd, opt): fills c->sao.<type>_filter[0..8] with
 * ff_vvc_sao_<type>_filter_<wd>_<bitd>_<opt>, where index 0..8 maps to
 * wd = 8, 16, 32, 48, 64, 80, 96, 112, 128.
 * SAO_INIT(bitd, opt): runs SAO_FILTER_INIT for both "band" and "edge".
 * Both expect a variable named c to be in scope at the expansion site.
 * NOTE(review): the closing lines of both do-blocks (original lines 222 and
 * 227, presumably "} while (0)") are missing from this listing.
 */
212 #define SAO_FILTER_INIT(type, bitd, opt) do { \
213 c->sao.type##_filter[0] = ff_vvc_sao_##type##_filter_8_##bitd##_##opt; \
214 c->sao.type##_filter[1] = ff_vvc_sao_##type##_filter_16_##bitd##_##opt; \
215 c->sao.type##_filter[2] = ff_vvc_sao_##type##_filter_32_##bitd##_##opt; \
216 c->sao.type##_filter[3] = ff_vvc_sao_##type##_filter_48_##bitd##_##opt; \
217 c->sao.type##_filter[4] = ff_vvc_sao_##type##_filter_64_##bitd##_##opt; \
218 c->sao.type##_filter[5] = ff_vvc_sao_##type##_filter_80_##bitd##_##opt; \
219 c->sao.type##_filter[6] = ff_vvc_sao_##type##_filter_96_##bitd##_##opt; \
220 c->sao.type##_filter[7] = ff_vvc_sao_##type##_filter_112_##bitd##_##opt; \
221 c->sao.type##_filter[8] = ff_vvc_sao_##type##_filter_128_##bitd##_##opt; \
224 #define SAO_INIT(bitd, opt) do { \
225 SAO_FILTER_INIT(band, bitd, opt); \
226 SAO_FILTER_INIT(edge, bitd, opt); \
/*
 * AVG_INIT(bd, opt): declares ff_vvc_avg_<bd>_<opt> and
 * ff_vvc_w_avg_<bd>_<opt> inside the do-block, then stores them in
 * c->inter.avg and c->inter.w_avg.
 * DMVR_INIT(bd): fills the 2x2 table c->inter.dmvr with the AVX2 routines:
 *   [0][0] = dmvr, [0][1] = dmvr_h, [1][0] = dmvr_v, [1][1] = dmvr_hv.
 * Both expect a variable named c to be in scope at the expansion site.
 * NOTE(review): the closing lines of both do-blocks (original lines 237 and
 * 244, presumably "} while (0)") are missing from this listing.
 */
229 #define AVG_INIT(bd, opt) do { \
230 void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
231 const int16_t *src0, const int16_t *src1, int width, int height);\
232 void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
233 const int16_t *src0, const int16_t *src1, int width, int height, \
234 int denom, int w0, int w1, int o0, int o1); \
235 c->inter.avg = bf(ff_vvc_avg, bd, opt); \
236 c->inter.w_avg = bf(ff_vvc_w_avg, bd, opt); \
239 #define DMVR_INIT(bd) do { \
240 c->inter.dmvr[0][0] = ff_vvc_dmvr_##bd##_avx2; \
241 c->inter.dmvr[0][1] = ff_vvc_dmvr_h_##bd##_avx2; \
242 c->inter.dmvr[1][0] = ff_vvc_dmvr_v_##bd##_avx2; \
243 c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_##bd##_avx2; \
/*
 * Function-table wiring for the SSE4 "put" routines.
 *   PEL_LINK(dst, C, W, idx1, idx2, name, D, opt):
 *       dst[C][W][idx1][idx2]     = vvc_put_<name>_<D>_<opt>      (FW_PUT forwarder)
 *       dst_uni[C][W][idx1][idx2] = ff_h2656_put_uni_<name>_<D>_<opt>
 *   MC_TAP_LINKS: PEL_LINK for W = 1..6 with name suffixes 4, 8, 16, 32, 64, 128.
 *   MC_8TAP_LINKS: MC_TAP_LINKS with C = LUMA.
 *   MC_4TAP_LINKS: W = 0 with suffix 2, then MC_TAP_LINKS with C = CHROMA.
 *   MC_8TAP_LINKS_SSE4 / MC_4TAP_LINKS_SSE4: fill c->inter.put at (my, mx) =
 *       (0,0) pixels, (0,1) *_h, (1,0) *_v, (1,1) *_hv.
 *   MC_LINK_SSE4(bd): both of the above.
 * All expect a variable named c to be in scope at the expansion site.
 */
246 #define PEL_LINK(dst, C, W, idx1, idx2, name, D, opt) \
247 dst[C][W][idx1][idx2] = vvc_put_## name ## _ ## D ## _##opt; \
248 dst ## _uni[C][W][idx1][idx2] = ff_h2656_put_uni_ ## name ## _ ## D ## _##opt; \
250 #define MC_TAP_LINKS(pointer, C, my, mx, fname, bitd, opt ) \
251 PEL_LINK(pointer, C, 1, my , mx , fname##4 , bitd, opt ); \
252 PEL_LINK(pointer, C, 2, my , mx , fname##8 , bitd, opt ); \
253 PEL_LINK(pointer, C, 3, my , mx , fname##16, bitd, opt ); \
254 PEL_LINK(pointer, C, 4, my , mx , fname##32, bitd, opt ); \
255 PEL_LINK(pointer, C, 5, my , mx , fname##64, bitd, opt ); \
256 PEL_LINK(pointer, C, 6, my , mx , fname##128, bitd, opt );
258 #define MC_8TAP_LINKS(pointer, my, mx, fname, bitd, opt) \
259 MC_TAP_LINKS(pointer, LUMA, my, mx, fname, bitd, opt)
261 #define MC_8TAP_LINKS_SSE4(bd) \
262 MC_8TAP_LINKS(c->inter.put, 0, 0, pixels, bd, sse4); \
263 MC_8TAP_LINKS(c->inter.put, 0, 1, 8tap_h, bd, sse4); \
264 MC_8TAP_LINKS(c->inter.put, 1, 0, 8tap_v, bd, sse4); \
265 MC_8TAP_LINKS(c->inter.put, 1, 1, 8tap_hv, bd, sse4)
267 #define MC_4TAP_LINKS(pointer, my, mx, fname, bitd, opt) \
268 PEL_LINK(pointer, CHROMA, 0, my , mx , fname##2 , bitd, opt ); \
269 MC_TAP_LINKS(pointer, CHROMA, my, mx, fname, bitd, opt) \
271 #define MC_4TAP_LINKS_SSE4(bd) \
272 MC_4TAP_LINKS(c->inter.put, 0, 0, pixels, bd, sse4); \
273 MC_4TAP_LINKS(c->inter.put, 0, 1, 4tap_h, bd, sse4); \
274 MC_4TAP_LINKS(c->inter.put, 1, 0, 4tap_v, bd, sse4); \
275 MC_4TAP_LINKS(c->inter.put, 1, 1, 4tap_hv, bd, sse4)
277 #define MC_LINK_SSE4(bd) \
278 MC_4TAP_LINKS_SSE4(bd) \
279 MC_8TAP_LINKS_SSE4(bd)
/*
 * Function-table wiring for the AVX2 "put" routines (writes c->inter.put and,
 * through PEL_LINK, c->inter.put_uni).
 *   MC_TAP_LINKS_AVX2(C, tap, bd): W = 4, 5, 6 (suffixes 32, 64, 128) for
 *       pixels at [0][0], <tap>tap_h at [0][1] and <tap>tap_v at [1][0].
 *   MC_LINKS_AVX2(bd): the above for (LUMA, 8) and (CHROMA, 4).
 *   MC_TAP_LINKS_16BPC_AVX2(C, tap, bd): W = 3 (suffix 16) for pixels, h, v
 *       and hv, plus hv at W = 4, 5, 6 (suffixes 32, 64, 128).
 *   MC_LINKS_16BPC_AVX2(bd): the above for (LUMA, 8) and (CHROMA, 4).
 * NOTE(review): the closing lines of the two do-blocks (original lines
 * 291-292 and 305-306, presumably "} while (0)") are missing from this listing.
 */
281 #define MC_TAP_LINKS_AVX2(C,tap,bd) do { \
282 PEL_LINK(c->inter.put, C, 4, 0, 0, pixels32, bd, avx2) \
283 PEL_LINK(c->inter.put, C, 5, 0, 0, pixels64, bd, avx2) \
284 PEL_LINK(c->inter.put, C, 6, 0, 0, pixels128, bd, avx2) \
285 PEL_LINK(c->inter.put, C, 4, 0, 1, tap##tap_h32, bd, avx2) \
286 PEL_LINK(c->inter.put, C, 5, 0, 1, tap##tap_h64, bd, avx2) \
287 PEL_LINK(c->inter.put, C, 6, 0, 1, tap##tap_h128, bd, avx2) \
288 PEL_LINK(c->inter.put, C, 4, 1, 0, tap##tap_v32, bd, avx2) \
289 PEL_LINK(c->inter.put, C, 5, 1, 0, tap##tap_v64, bd, avx2) \
290 PEL_LINK(c->inter.put, C, 6, 1, 0, tap##tap_v128, bd, avx2) \
293 #define MC_LINKS_AVX2(bd) \
294 MC_TAP_LINKS_AVX2(LUMA, 8, bd); \
295 MC_TAP_LINKS_AVX2(CHROMA, 4, bd);
297 #define MC_TAP_LINKS_16BPC_AVX2(C, tap, bd) do { \
298 PEL_LINK(c->inter.put, C, 3, 0, 0, pixels16, bd, avx2) \
299 PEL_LINK(c->inter.put, C, 3, 0, 1, tap##tap_h16, bd, avx2) \
300 PEL_LINK(c->inter.put, C, 3, 1, 0, tap##tap_v16, bd, avx2) \
301 PEL_LINK(c->inter.put, C, 3, 1, 1, tap##tap_hv16, bd, avx2) \
302 PEL_LINK(c->inter.put, C, 4, 1, 1, tap##tap_hv32, bd, avx2) \
303 PEL_LINK(c->inter.put, C, 5, 1, 1, tap##tap_hv64, bd, avx2) \
304 PEL_LINK(c->inter.put, C, 6, 1, 1, tap##tap_hv128, bd, avx2) \
307 #define MC_LINKS_16BPC_AVX2(bd) \
308 MC_TAP_LINKS_16BPC_AVX2(LUMA, 8, bd); \
309 MC_TAP_LINKS_16BPC_AVX2(CHROMA, 4, bd);
/*
 * Prototype of the AVX2 SAD routine; no definition is visible in this
 * listing (presumably implemented in assembly -- TODO confirm).
 * SAD_INIT() stores it in c->inter.sad and expects a variable named c to be
 * in scope at the expansion site.
 */
311 int ff_vvc_sad_avx2(
const int16_t *
src0,
const int16_t *
src1,
int dx,
int dy,
int block_w,
int block_h);
312 #define SAD_INIT() c->inter.sad = ff_vvc_sad_avx2
/*
 * ALF_INIT(bd): installs the AVX2 ALF wrappers generated by ALF_FUNCS:
 *   c->alf.filter[LUMA]   = vvc_alf_filter_luma_<bd>_avx2
 *   c->alf.filter[CHROMA] = vvc_alf_filter_chroma_<bd>_avx2
 *   c->alf.classify       = vvc_alf_classify_<bd>_avx2
 * Expects a variable named c to be in scope at the expansion site.
 * NOTE(review): the closing line of the do-block (original line 318,
 * presumably "} while (0)") is missing from this listing.
 */
314 #define ALF_INIT(bd) do { \
315 c->alf.filter[LUMA] = vvc_alf_filter_luma_##bd##_avx2; \
316 c->alf.filter[CHROMA] = vvc_alf_filter_chroma_##bd##_avx2; \
317 c->alf.classify = vvc_alf_classify_##bd##_avx2; \
323 #endif // ARCH_X86_64
332 #if HAVE_SSE4_EXTERNAL
337 #if HAVE_AVX2_EXTERNAL
353 #if HAVE_SSE4_EXTERNAL
358 #if HAVE_AVX2_EXTERNAL
364 MC_LINKS_16BPC_AVX2(10);
375 #if HAVE_SSE4_EXTERNAL
380 #if HAVE_AVX2_EXTERNAL
386 MC_LINKS_16BPC_AVX2(12);
#define EXTERNAL_AVX2_FAST(flags)
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
static atomic_int cpu_flags
#define DMVR_PROTOTYPES(bd, opt)
Undefined Behavior: In the C language, some operations are undefined, like signed integer overflow, dereferencing freed pointers, accessing outside allocated space, ... Undefined Behavior must not occur in a C program; it is not safe even if the output of undefined operations is unused. The unsafety may seem nit-picking, but optimizing compilers have in fact optimized code on the assumption that no undefined behavior occurs. Optimizing code based on wrong assumptions can and has in some cases led to effects beyond the output of computations. The signed integer overflow problem in speed-critical code: code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c… (text truncated in the source)
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst
#define DMVR_INIT(bd, opt)
#define EXTERNAL_SSE4(flags)
av_cold void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)