FFmpeg
dsp_init.c
Go to the documentation of this file.
1 /*
2  * VVC DSP init for x86
3  *
4  * Copyright (C) 2022-2024 Nuo Mi
5  * Copyright (c) 2023-2024 Wu Jianhua
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #include "config.h"
25 
26 #include "libavutil/attributes.h"
27 #include "libavutil/cpu.h"
28 #include "libavutil/x86/cpu.h"
29 #include "libavcodec/vvc/dec.h"
30 #include "libavcodec/vvc/ctu.h"
31 #include "libavcodec/vvc/dsp.h"
32 #include "libavcodec/x86/h26x/h2656dsp.h"
33 
34 #if ARCH_X86_64
35 
/*
 * Name-building helpers based on token pasting:
 *   bf(fn, bd, opt)  -> fn_<bd>_<opt>       (e.g. fn_10_avx2)
 *   BF(fn, bpc, opt) -> fn_<bpc>bpc_<opt>   (e.g. fn_16bpc_avx2)
 */
36 #define bf(fn, bd, opt) fn##_##bd##_##opt
37 #define BF(fn, bpc, opt) fn##_##bpc##bpc_##opt
38 
/*
 * Declare the four DMVR functions for one bit depth / instruction set:
 *   ff_vvc_dmvr_<bd>_<opt>, ff_vvc_dmvr_h_<bd>_<opt>,
 *   ff_vvc_dmvr_v_<bd>_<opt>, ff_vvc_dmvr_hv_<bd>_<opt>.
 * All four share one signature. Only prototypes are emitted here; the
 * definitions live outside this file (presumably x86 assembly - confirm).
 */
39 #define DMVR_PROTOTYPES(bd, opt) \
40 void ff_vvc_dmvr_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \
41  int height, intptr_t mx, intptr_t my, int width); \
42 void ff_vvc_dmvr_h_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \
43  int height, intptr_t mx, intptr_t my, int width); \
44 void ff_vvc_dmvr_v_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \
45  int height, intptr_t mx, intptr_t my, int width); \
46 void ff_vvc_dmvr_hv_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \
47  int height, intptr_t mx, intptr_t my, int width); \
48 
/* Prototypes are instantiated for 8-, 10- and 12-bit AVX2. */
49 DMVR_PROTOTYPES( 8, avx2)
50 DMVR_PROTOTYPES(10, avx2)
51 DMVR_PROTOTYPES(12, avx2)
52 
/*
 * BDOF glue, compiled only when external AVX2 code is available.
 * The ARCH_X86_64 test repeats the condition of the enclosing #if.
 */
53 #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
/* Bit-depth independent AVX2 routine; the caller supplies pixel_max. */
54 void ff_vvc_apply_bdof_avx2(uint8_t *dst, ptrdiff_t dst_stride,
55  const int16_t *src0, const int16_t *src1,
56  int w, int h, int pixel_max);
57 
/*
 * Define static wrapper vvc_apply_bdof_<bd>_<opt>(). It takes no pixel_max
 * argument of its own and forwards to ff_vvc_apply_bdof_<opt>() with
 * (1 << bd) - 1 as pixel_max.
 */
58 #define OF_FUNC(bd, opt) \
59 static void vvc_apply_bdof_##bd##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
60  const int16_t *src0, const int16_t *src1, int w, int h) \
61 { \
62  ff_vvc_apply_bdof##_##opt(dst, dst_stride, src0, src1, w, h, (1 << bd) - 1); \
63 } \
64 
/* Wrappers exist for 8, 10 and 12 bits. */
65 OF_FUNC( 8, avx2)
66 OF_FUNC(10, avx2)
67 OF_FUNC(12, avx2)
68 
/* Store the AVX2 wrapper for bit depth bd in c->inter.apply_bdof; expects a variable `c` in scope. */
69 #define OF_INIT(bd) c->inter.apply_bdof = vvc_apply_bdof_##bd##_avx2
70 #endif
71 
/*
 * Declare the four ALF functions for one bits-per-component class:
 *   ff_vvc_alf_filter_luma_<bpc>bpc_<opt>
 *   ff_vvc_alf_filter_chroma_<bpc>bpc_<opt>
 *   ff_vvc_alf_classify_grad_<bpc>bpc_<opt>
 *   ff_vvc_alf_classify_<bpc>bpc_<opt>
 * Sizes and positions are passed as ptrdiff_t / intptr_t rather than int.
 * Only prototypes are emitted; the definitions are outside this file.
 */
72 #define ALF_BPC_PROTOTYPES(bpc, opt) \
73 void BF(ff_vvc_alf_filter_luma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
74  const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, \
75  const int16_t *filter, const int16_t *clip, ptrdiff_t stride, ptrdiff_t vb_pos, ptrdiff_t pixel_max); \
76 void BF(ff_vvc_alf_filter_chroma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
77  const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, \
78  const int16_t *filter, const int16_t *clip, ptrdiff_t stride, ptrdiff_t vb_pos, ptrdiff_t pixel_max); \
79 void BF(ff_vvc_alf_classify_grad, bpc, opt)(int *gradient_sum, \
80  const uint8_t *src, ptrdiff_t src_stride, intptr_t width, intptr_t height, intptr_t vb_pos); \
81 void BF(ff_vvc_alf_classify, bpc, opt)(int *class_idx, int *transpose_idx, const int *gradient_sum, \
82  intptr_t width, intptr_t height, intptr_t vb_pos, intptr_t bit_depth); \
83 
/* Two classes are declared: 8bpc and 16bpc (the latter is used for 10 and 12 bits by ALF_FUNCS). */
84 ALF_BPC_PROTOTYPES(8, avx2)
85 ALF_BPC_PROTOTYPES(16, avx2)
86 
87 #if ARCH_X86_64
/*
 * Define static forwarding wrapper vvc_put_<name>_<depth>_<opt>().
 * It calls ff_h2656_put_<name>_<depth>_<opt>() with the same arguments plus
 * 2 * MAX_PB_SIZE inserted as the second argument.
 * NOTE(review): that constant is presumably the destination stride in
 * bytes - confirm against the ff_h2656_put_* prototypes.
 */
88 #define FW_PUT(name, depth, opt) \
89 static void vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, \
90  int height, const int8_t *hf, const int8_t *vf, int width) \
91 { \
92  ff_h2656_put_## name ## _ ## depth ## _##opt(dst, 2 * MAX_PB_SIZE, src, srcstride, height, hf, vf, width); \
93 }
94 
/* SSE4 forwarding wrappers, generated only when external SSE4 code is built. */
95 #if HAVE_SSE4_EXTERNAL
/* One FW_PUT wrapper per size suffix 4, 8, 16, 32, 64 and 128 for the given filter name. */
96 #define FW_PUT_TAP(fname, bitd, opt ) \
97  FW_PUT(fname##4, bitd, opt ) \
98  FW_PUT(fname##8, bitd, opt ) \
99  FW_PUT(fname##16, bitd, opt ) \
100  FW_PUT(fname##32, bitd, opt ) \
101  FW_PUT(fname##64, bitd, opt ) \
102  FW_PUT(fname##128, bitd, opt ) \
103 
/* Same as FW_PUT_TAP plus an extra wrapper with size suffix 2. */
104 #define FW_PUT_4TAP(fname, bitd, opt) \
105  FW_PUT(fname ## 2, bitd, opt) \
106  FW_PUT_TAP(fname, bitd, opt)
107 
/* 4-tap set: pixels, 4tap_h, 4tap_v and 4tap_hv (sizes 2..128). The "pixels" wrappers are only generated here. */
108 #define FW_PUT_4TAP_SSE4(bitd) \
109  FW_PUT_4TAP(pixels, bitd, sse4) \
110  FW_PUT_4TAP(4tap_h, bitd, sse4) \
111  FW_PUT_4TAP(4tap_v, bitd, sse4) \
112  FW_PUT_4TAP(4tap_hv, bitd, sse4)
113 
/* 8-tap set: 8tap_h, 8tap_v and 8tap_hv (sizes 4..128). */
114 #define FW_PUT_8TAP_SSE4(bitd) \
115  FW_PUT_TAP(8tap_h, bitd, sse4) \
116  FW_PUT_TAP(8tap_v, bitd, sse4) \
117  FW_PUT_TAP(8tap_hv, bitd, sse4)
118 
/* All SSE4 wrappers for one bit depth. */
119 #define FW_PUT_SSE4(bitd) \
120  FW_PUT_4TAP_SSE4(bitd) \
121  FW_PUT_8TAP_SSE4(bitd)
122 
/* Instantiate for 8, 10 and 12 bits. */
123 FW_PUT_SSE4( 8)
124 FW_PUT_SSE4(10)
125 FW_PUT_SSE4(12)
126 #endif
127 
128 #if HAVE_AVX2_EXTERNAL
/*
 * AVX2 forwarding wrappers for an n-tap filter: horizontal and vertical
 * variants with size suffixes 32, 64 and 128. No hv wrappers are generated
 * by this macro.
 */
129 #define FW_PUT_TAP_AVX2(n, bitd) \
130  FW_PUT(n ## tap_h32, bitd, avx2) \
131  FW_PUT(n ## tap_h64, bitd, avx2) \
132  FW_PUT(n ## tap_h128, bitd, avx2) \
133  FW_PUT(n ## tap_v32, bitd, avx2) \
134  FW_PUT(n ## tap_v64, bitd, avx2) \
135  FW_PUT(n ## tap_v128, bitd, avx2)
136 
/* All wrappers of this group for one bit depth: pixels32/64/128 plus the 4-tap and 8-tap h/v sets. */
137 #define FW_PUT_AVX2(bitd) \
138  FW_PUT(pixels32, bitd, avx2) \
139  FW_PUT(pixels64, bitd, avx2) \
140  FW_PUT(pixels128, bitd, avx2) \
141  FW_PUT_TAP_AVX2(4, bitd) \
142  FW_PUT_TAP_AVX2(8, bitd) \
143 
/* Instantiate for 8, 10 and 12 bits. */
144 FW_PUT_AVX2( 8)
145 FW_PUT_AVX2(10)
146 FW_PUT_AVX2(12)
147 
/*
 * Additional AVX2 wrappers for an n-tap filter: h16, v16 and the hv variants
 * with size suffixes 16, 32, 64 and 128.
 */
148 #define FW_PUT_TAP_16BPC_AVX2(n, bitd) \
149  FW_PUT(n ## tap_h16, bitd, avx2) \
150  FW_PUT(n ## tap_v16, bitd, avx2) \
151  FW_PUT(n ## tap_hv16, bitd, avx2) \
152  FW_PUT(n ## tap_hv32, bitd, avx2) \
153  FW_PUT(n ## tap_hv64, bitd, avx2) \
154  FW_PUT(n ## tap_hv128, bitd, avx2)
155 
/* pixels16 plus the 4-tap and 8-tap sets above, for one bit depth. */
156 #define FW_PUT_16BPC_AVX2(bitd) \
157  FW_PUT(pixels16, bitd, avx2) \
158  FW_PUT_TAP_16BPC_AVX2(4, bitd) \
159  FW_PUT_TAP_16BPC_AVX2(8, bitd)
160 
/* Only instantiated for 10 and 12 bits; there is no 8-bit instance. */
161 FW_PUT_16BPC_AVX2(10)
162 FW_PUT_16BPC_AVX2(12)
163 
/*
 * Define three static ALF wrappers for bit depth bd that forward to the
 * <bpc>bpc functions declared by ALF_BPC_PROTOTYPES:
 *   vvc_alf_filter_luma_<bd>_<opt>   - passes (width >> 2) * ALF_NUM_COEFF_LUMA
 *                                      as the stride argument and
 *                                      (1 << bd) - 1 as pixel_max
 *   vvc_alf_filter_chroma_<bd>_<opt> - passes 0 as the stride argument and
 *                                      (1 << bd) - 1 as pixel_max
 *   vvc_alf_classify_<bd>_<opt>      - calls classify_grad to fill
 *                                      gradient_tmp, then classify with bd
 *                                      as the bit_depth argument
 * The wrappers take int sizes; the callee prototypes take
 * ptrdiff_t / intptr_t, so the arguments are converted at the call.
 */
164 #define ALF_FUNCS(bpc, bd, opt) \
165 static void bf(vvc_alf_filter_luma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, \
166  int width, int height, const int16_t *filter, const int16_t *clip, const int vb_pos) \
167 { \
168  const int param_stride = (width >> 2) * ALF_NUM_COEFF_LUMA; \
169  BF(ff_vvc_alf_filter_luma, bpc, opt)(dst, dst_stride, src, src_stride, width, height, \
170  filter, clip, param_stride, vb_pos, (1 << bd) - 1); \
171 } \
172 static void bf(vvc_alf_filter_chroma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, \
173  int width, int height, const int16_t *filter, const int16_t *clip, const int vb_pos) \
174 { \
175  BF(ff_vvc_alf_filter_chroma, bpc, opt)(dst, dst_stride, src, src_stride, width, height, \
176  filter, clip, 0, vb_pos,(1 << bd) - 1); \
177 } \
178 static void bf(vvc_alf_classify, bd, opt)(int *class_idx, int *transpose_idx, \
179  const uint8_t *src, ptrdiff_t src_stride, int width, int height, int vb_pos, int *gradient_tmp) \
180 { \
181  BF(ff_vvc_alf_classify_grad, bpc, opt)(gradient_tmp, src, src_stride, width, height, vb_pos); \
182  BF(ff_vvc_alf_classify, bpc, opt)(class_idx, transpose_idx, gradient_tmp, width, height, vb_pos, bd); \
183 } \
184 
/* 8-bit uses the 8bpc functions; 10- and 12-bit both use the 16bpc functions. */
185 ALF_FUNCS(8, 8, avx2)
186 ALF_FUNCS(16, 10, avx2)
187 ALF_FUNCS(16, 12, avx2)
188 
189 #endif
190 
/*
 * Declare the SAO band and edge filter for one width / bit depth:
 *   ff_vvc_sao_band_filter_<wd>_<bitd>_<opt>
 *   ff_vvc_sao_edge_filter_<wd>_<bitd>_<opt>
 * The band filter takes separate destination and source strides; the edge
 * filter prototype only has a destination stride.
 */
191 #define SAO_FILTER_FUNC(wd, bitd, opt) \
192 void ff_vvc_sao_band_filter_##wd##_##bitd##_##opt(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
193  const int16_t *sao_offset_val, int sao_left_class, int width, int height); \
194 void ff_vvc_sao_edge_filter_##wd##_##bitd##_##opt(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, \
195  const int16_t *sao_offset_val, int eo, int width, int height); \
196 
/* Declare both filters for the nine widths 8, 16, 32, 48, 64, 80, 96, 112 and 128. */
197 #define SAO_FILTER_FUNCS(bitd, opt) \
198  SAO_FILTER_FUNC(8, bitd, opt) \
199  SAO_FILTER_FUNC(16, bitd, opt) \
200  SAO_FILTER_FUNC(32, bitd, opt) \
201  SAO_FILTER_FUNC(48, bitd, opt) \
202  SAO_FILTER_FUNC(64, bitd, opt) \
203  SAO_FILTER_FUNC(80, bitd, opt) \
204  SAO_FILTER_FUNC(96, bitd, opt) \
205  SAO_FILTER_FUNC(112, bitd, opt) \
206  SAO_FILTER_FUNC(128, bitd, opt) \
207 
/* Prototypes for 8-, 10- and 12-bit AVX2. */
208 SAO_FILTER_FUNCS(8, avx2)
209 SAO_FILTER_FUNCS(10, avx2)
210 SAO_FILTER_FUNCS(12, avx2)
211 
/*
 * Fill c->sao.<type>_filter[0..8] with the functions for widths
 * 8, 16, 32, 48, 64, 80, 96, 112 and 128 (index i holds width 16 * i,
 * except index 0 which holds width 8). `type` is pasted into both the member
 * name and the function name; expects a variable `c` in scope.
 */
212 #define SAO_FILTER_INIT(type, bitd, opt) do { \
213  c->sao.type##_filter[0] = ff_vvc_sao_##type##_filter_8_##bitd##_##opt; \
214  c->sao.type##_filter[1] = ff_vvc_sao_##type##_filter_16_##bitd##_##opt; \
215  c->sao.type##_filter[2] = ff_vvc_sao_##type##_filter_32_##bitd##_##opt; \
216  c->sao.type##_filter[3] = ff_vvc_sao_##type##_filter_48_##bitd##_##opt; \
217  c->sao.type##_filter[4] = ff_vvc_sao_##type##_filter_64_##bitd##_##opt; \
218  c->sao.type##_filter[5] = ff_vvc_sao_##type##_filter_80_##bitd##_##opt; \
219  c->sao.type##_filter[6] = ff_vvc_sao_##type##_filter_96_##bitd##_##opt; \
220  c->sao.type##_filter[7] = ff_vvc_sao_##type##_filter_112_##bitd##_##opt; \
221  c->sao.type##_filter[8] = ff_vvc_sao_##type##_filter_128_##bitd##_##opt; \
222 } while (0)
223 
/* Install both the band and the edge filter tables for one bit depth. */
224 #define SAO_INIT(bitd, opt) do { \
225  SAO_FILTER_INIT(band, bitd, opt); \
226  SAO_FILTER_INIT(edge, bitd, opt); \
227 } while (0)
228 
/*
 * Declare ff_vvc_avg_<bd>_<opt> and ff_vvc_w_avg_<bd>_<opt> and store them in
 * c->inter.avg and c->inter.w_avg. The prototypes are written inside the
 * do-block, so they are block-scope declarations visible only within the
 * expansion. Expects a variable `c` in scope.
 */
229 #define AVG_INIT(bd, opt) do { \
230 void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
231  const int16_t *src0, const int16_t *src1, int width, int height);\
232 void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
233  const int16_t *src0, const int16_t *src1, int width, int height, \
234  int denom, int w0, int w1, int o0, int o1); \
235  c->inter.avg = bf(ff_vvc_avg, bd, opt); \
236  c->inter.w_avg = bf(ff_vvc_w_avg, bd, opt); \
237 } while (0)
238 
/*
 * Fill the 2x2 table c->inter.dmvr with the AVX2 functions for bit depth bd:
 *   [0][0] plain, [0][1] _h, [1][0] _v, [1][1] _hv.
 * Expects a variable `c` in scope.
 */
239 #define DMVR_INIT(bd) do { \
240  c->inter.dmvr[0][0] = ff_vvc_dmvr_##bd##_avx2; \
241  c->inter.dmvr[0][1] = ff_vvc_dmvr_h_##bd##_avx2; \
242  c->inter.dmvr[1][0] = ff_vvc_dmvr_v_##bd##_avx2; \
243  c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_##bd##_avx2; \
244 } while (0)
245 
/*
 * Install one motion-compensation function pair:
 *   dst[C][W][idx1][idx2]     = vvc_put_<name>_<D>_<opt>   (static FW_PUT wrapper)
 *   dst_uni[C][W][idx1][idx2] = ff_h2656_put_uni_<name>_<D>_<opt>   (used directly)
 * `dst` is pasted with _uni, so it must end in an identifier (e.g. c->inter.put).
 * The expansion is two separate statements that already end in ';' and is
 * not wrapped in do/while, so it must only be used inside a braced block.
 */
246 #define PEL_LINK(dst, C, W, idx1, idx2, name, D, opt) \
247  dst[C][W][idx1][idx2] = vvc_put_## name ## _ ## D ## _##opt; \
248  dst ## _uni[C][W][idx1][idx2] = ff_h2656_put_uni_ ## name ## _ ## D ## _##opt; \
249 
/* Link size suffixes 4, 8, 16, 32, 64, 128 of one filter to W indices 1..6. */
250 #define MC_TAP_LINKS(pointer, C, my, mx, fname, bitd, opt ) \
251  PEL_LINK(pointer, C, 1, my , mx , fname##4 , bitd, opt ); \
252  PEL_LINK(pointer, C, 2, my , mx , fname##8 , bitd, opt ); \
253  PEL_LINK(pointer, C, 3, my , mx , fname##16, bitd, opt ); \
254  PEL_LINK(pointer, C, 4, my , mx , fname##32, bitd, opt ); \
255  PEL_LINK(pointer, C, 5, my , mx , fname##64, bitd, opt ); \
256  PEL_LINK(pointer, C, 6, my , mx , fname##128, bitd, opt );
257 
/* 8-tap filters are linked into the LUMA plane of the table. */
258 #define MC_8TAP_LINKS(pointer, my, mx, fname, bitd, opt) \
259  MC_TAP_LINKS(pointer, LUMA, my, mx, fname, bitd, opt)
260 
/* (my, mx) = (0,0) pixels, (0,1) 8tap_h, (1,0) 8tap_v, (1,1) 8tap_hv. */
261 #define MC_8TAP_LINKS_SSE4(bd) \
262  MC_8TAP_LINKS(c->inter.put, 0, 0, pixels, bd, sse4); \
263  MC_8TAP_LINKS(c->inter.put, 0, 1, 8tap_h, bd, sse4); \
264  MC_8TAP_LINKS(c->inter.put, 1, 0, 8tap_v, bd, sse4); \
265  MC_8TAP_LINKS(c->inter.put, 1, 1, 8tap_hv, bd, sse4)
266 
/* 4-tap filters go into the CHROMA plane and additionally link size suffix 2 at W index 0. */
267 #define MC_4TAP_LINKS(pointer, my, mx, fname, bitd, opt) \
268  PEL_LINK(pointer, CHROMA, 0, my , mx , fname##2 , bitd, opt ); \
269  MC_TAP_LINKS(pointer, CHROMA, my, mx, fname, bitd, opt) \
270 
/* (my, mx) = (0,0) pixels, (0,1) 4tap_h, (1,0) 4tap_v, (1,1) 4tap_hv. */
271 #define MC_4TAP_LINKS_SSE4(bd) \
272  MC_4TAP_LINKS(c->inter.put, 0, 0, pixels, bd, sse4); \
273  MC_4TAP_LINKS(c->inter.put, 0, 1, 4tap_h, bd, sse4); \
274  MC_4TAP_LINKS(c->inter.put, 1, 0, 4tap_v, bd, sse4); \
275  MC_4TAP_LINKS(c->inter.put, 1, 1, 4tap_hv, bd, sse4)
276 
/*
 * Link every SSE4 put function for one bit depth. There is no separator
 * between the two sub-macros: this works because the expansion of
 * MC_4TAP_LINKS_SSE4 already ends with the ';' emitted by PEL_LINK.
 */
277 #define MC_LINK_SSE4(bd) \
278  MC_4TAP_LINKS_SSE4(bd) \
279  MC_8TAP_LINKS_SSE4(bd)
280 
/*
 * Link the AVX2 put functions for plane C at W indices 4, 5, 6 (size
 * suffixes 32, 64, 128): pixels at [0][0], <tap>tap_h at [0][1] and
 * <tap>tap_v at [1][0]. The [1][1] (hv) entries are not written here.
 * `tap##tap_h32` pastes the macro argument onto the literal "tap_h32",
 * giving e.g. 8tap_h32.
 */
281 #define MC_TAP_LINKS_AVX2(C,tap,bd) do { \
282  PEL_LINK(c->inter.put, C, 4, 0, 0, pixels32, bd, avx2) \
283  PEL_LINK(c->inter.put, C, 5, 0, 0, pixels64, bd, avx2) \
284  PEL_LINK(c->inter.put, C, 6, 0, 0, pixels128, bd, avx2) \
285  PEL_LINK(c->inter.put, C, 4, 0, 1, tap##tap_h32, bd, avx2) \
286  PEL_LINK(c->inter.put, C, 5, 0, 1, tap##tap_h64, bd, avx2) \
287  PEL_LINK(c->inter.put, C, 6, 0, 1, tap##tap_h128, bd, avx2) \
288  PEL_LINK(c->inter.put, C, 4, 1, 0, tap##tap_v32, bd, avx2) \
289  PEL_LINK(c->inter.put, C, 5, 1, 0, tap##tap_v64, bd, avx2) \
290  PEL_LINK(c->inter.put, C, 6, 1, 0, tap##tap_v128, bd, avx2) \
291  } while (0)
292 
/* LUMA uses the 8-tap functions, CHROMA the 4-tap ones. The macro itself ends with ';'. */
293 #define MC_LINKS_AVX2(bd) \
294  MC_TAP_LINKS_AVX2(LUMA, 8, bd); \
295  MC_TAP_LINKS_AVX2(CHROMA, 4, bd);
296 
/*
 * Link the additional AVX2 put functions for plane C:
 *   W index 3 (size suffix 16): pixels16, <tap>tap_h16, <tap>tap_v16, <tap>tap_hv16
 *   W indices 4, 5, 6 (32, 64, 128): the [1][1] (hv) entries only.
 */
297 #define MC_TAP_LINKS_16BPC_AVX2(C, tap, bd) do { \
298  PEL_LINK(c->inter.put, C, 3, 0, 0, pixels16, bd, avx2) \
299  PEL_LINK(c->inter.put, C, 3, 0, 1, tap##tap_h16, bd, avx2) \
300  PEL_LINK(c->inter.put, C, 3, 1, 0, tap##tap_v16, bd, avx2) \
301  PEL_LINK(c->inter.put, C, 3, 1, 1, tap##tap_hv16, bd, avx2) \
302  PEL_LINK(c->inter.put, C, 4, 1, 1, tap##tap_hv32, bd, avx2) \
303  PEL_LINK(c->inter.put, C, 5, 1, 1, tap##tap_hv64, bd, avx2) \
304  PEL_LINK(c->inter.put, C, 6, 1, 1, tap##tap_hv128, bd, avx2) \
305  } while (0)
306 
/* LUMA uses the 8-tap functions, CHROMA the 4-tap ones. The macro itself ends with ';'. */
307 #define MC_LINKS_16BPC_AVX2(bd) \
308  MC_TAP_LINKS_16BPC_AVX2(LUMA, 8, bd); \
309  MC_TAP_LINKS_16BPC_AVX2(CHROMA, 4, bd);
310 
/* AVX2 SAD routine (defined outside this file) and the macro that stores it in c->inter.sad. */
311 int ff_vvc_sad_avx2(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
312 #define SAD_INIT() c->inter.sad = ff_vvc_sad_avx2
313 
/*
 * Install the static AVX2 ALF wrappers generated by ALF_FUNCS for bit depth
 * bd: the luma and chroma filters into c->alf.filter[] and the classifier
 * into c->alf.classify. Expects a variable `c` in scope.
 */
314 #define ALF_INIT(bd) do { \
315  c->alf.filter[LUMA] = vvc_alf_filter_luma_##bd##_avx2; \
316  c->alf.filter[CHROMA] = vvc_alf_filter_chroma_##bd##_avx2; \
317  c->alf.classify = vvc_alf_classify_##bd##_avx2; \
318 } while (0)
319 
320 #endif
321 
322 
323 #endif // ARCH_X86_64
324 
325 av_cold void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
326 {
327 #if ARCH_X86_64
328  const int cpu_flags = av_get_cpu_flags();
329 
330  switch (bd) {
331  case 8:
332 #if HAVE_SSE4_EXTERNAL
333  if (EXTERNAL_SSE4(cpu_flags)) {
334  MC_LINK_SSE4(8);
335  }
336 #endif
337 #if HAVE_AVX2_EXTERNAL
339  // inter
340  AVG_INIT(8, avx2);
341  DMVR_INIT(8);
342  MC_LINKS_AVX2(8);
343  OF_INIT(8);
344  SAD_INIT();
345 
346  // filter
347  ALF_INIT(8);
348  SAO_INIT(8, avx2);
349  }
350 #endif
351  break;
352  case 10:
353 #if HAVE_SSE4_EXTERNAL
354  if (EXTERNAL_SSE4(cpu_flags)) {
355  MC_LINK_SSE4(10);
356  }
357 #endif
358 #if HAVE_AVX2_EXTERNAL
360  // inter
361  AVG_INIT(10, avx2);
362  DMVR_INIT(10);
363  MC_LINKS_AVX2(10);
364  MC_LINKS_16BPC_AVX2(10);
365  OF_INIT(10);
366  SAD_INIT();
367 
368  // filter
369  ALF_INIT(10);
370  SAO_INIT(10, avx2);
371  }
372 #endif
373  break;
374  case 12:
375 #if HAVE_SSE4_EXTERNAL
376  if (EXTERNAL_SSE4(cpu_flags)) {
377  MC_LINK_SSE4(12);
378  }
379 #endif
380 #if HAVE_AVX2_EXTERNAL
382  // inter
383  AVG_INIT(12, avx2);
384  DMVR_INIT(12);
385  MC_LINKS_AVX2(12);
386  MC_LINKS_16BPC_AVX2(12);
387  OF_INIT(12);
388  SAD_INIT();
389 
390  // filter
391  ALF_INIT(12);
392  SAO_INIT(12, avx2);
393  }
394 #endif
395  break;
396  default:
397  break;
398  }
399 #endif
400 }
cpu.h
src1
const pixel * src1
Definition: h264pred_template.c:420
EXTERNAL_AVX2_FAST
#define EXTERNAL_AVX2_FAST(flags)
Definition: cpu.h:73
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:109
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:56
DMVR_PROTOTYPES
#define DMVR_PROTOTYPES(bd, opt)
Definition: dsp_init.c:42
av_cold
#define av_cold
Definition: attributes.h:106
dsp.h
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
dst
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst
Definition: dsp.h:87
cpu.h
DMVR_INIT
#define DMVR_INIT(bd, opt)
Definition: dsp_init.c:55
attributes.h
h2656dsp.h
EXTERNAL_SSE4
#define EXTERNAL_SSE4(flags)
Definition: cpu.h:62
src0
const pixel *const src0
Definition: h264pred_template.c:419
w
uint8_t w
Definition: llvidencdsp.c:39
h
h
Definition: vp9dsp_template.c:2070
ctu.h
ff_vvc_dsp_init_x86
av_cold void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
Definition: dsp_init.c:325
dec.h
VVCDSPContext
Definition: dsp.h:170