Go to the documentation of this file.
/* Block-size constants for the BDOF code (16 and 4).
 * NOTE(review): their uses are not visible in this listing; presumably the
 * maximum and minimum block dimensions handled per call - confirm. */
30 #define BDOF_BLOCK_SIZE 16
31 #define BDOF_MIN_BLOCK_SIZE 4
91 const int block_w,
const int block_h);
104 const int16_t *
src0,
const int16_t *
src1,
106 uintptr_t w0_w1, uintptr_t offset_shift);
108 const int16_t *
src0,
const int16_t *
src1,
110 uintptr_t w0_w1, uintptr_t offset_shift);
112 const int16_t *
src0,
const int16_t *
src1,
114 uintptr_t w0_w1, uintptr_t offset_shift);
/*
 * W_AVG_FUN(bit_depth): defines the static wrapper vvc_w_avg_<bit_depth>(),
 * which adapts the (denom, w0, w1, o) argument form to the packed form taken
 * by ff_vvc_w_avg_<bit_depth>_neon().
 *
 *   shift  = denom + FFMAX(3, 15 - bit_depth)
 *   offset = (o * (1 << (bit_depth - 8)) + 1) * (1 << (shift - 1))
 *
 * The two weights are packed into a single uintptr_t (w0 in the upper 32
 * bits, w1 in the lower 32 bits), and offset/shift are packed the same way
 * (offset upper, shift lower). The 32-bit left shifts therefore require a
 * uintptr_t of at least 64 bits.
 *
 * dst, dst_stride, src0, src1, width and height are forwarded unchanged.
 *
 * NOTE(review): the lines holding the function body's opening and closing
 * braces are not present in this listing.
 */
120 #define W_AVG_FUN(bit_depth) \
121 static void vvc_w_avg_ ## bit_depth(uint8_t *dst, ptrdiff_t dst_stride, \
122 const int16_t *src0, const int16_t *src1, int width, int height, \
123 int denom, int w0, int w1, int o) \
125 int shift = denom + FFMAX(3, 15 - bit_depth); \
126 int offset = (o * (1 << (bit_depth - 8)) + 1) * (1 << (shift - 1)); \
127 uintptr_t w0_w1 = ((uintptr_t)w0 << 32) | (uint32_t)w1; \
128 uintptr_t offset_shift = ((uintptr_t)offset << 32) | (uint32_t)shift; \
129 ff_vvc_w_avg_ ## bit_depth ## _neon(dst, dst_stride, src0, src1, width, height, w0_w1, offset_shift); \
/*
 * DMVR_FUN(fn, bd): declares the prototype of ff_vvc_dmvr_<fn><bd>_neon().
 * The name is built by token-pasting fn and bd between "ff_vvc_dmvr_" and
 * "_neon"; the expansion ends with ';', so it is a declaration only.
 * The declared function returns void and takes a destination int16_t
 * buffer, a const uint8_t source pointer with its stride, a height, two
 * intptr_t values (mx, my) and a width.
 */
136 #define DMVR_FUN(fn, bd) \
137 void ff_vvc_dmvr_ ## fn ## bd ## _neon(int16_t *dst, \
138 const uint8_t *_src, ptrdiff_t _src_stride, int height, \
139 intptr_t mx, intptr_t my, int width);
/*
 * APPLY_BDOF_FUNC(bd): declares the prototype of ff_vvc_apply_bdof_<bd>_neon().
 * bd is token-pasted into the function name; the expansion ends with ';',
 * so it is a declaration only. The declared function returns void and takes
 * a destination pointer with its stride, two const int16_t source pointers,
 * and the block width and height.
 */
151 #define APPLY_BDOF_FUNC(bd) \
152 void ff_vvc_apply_bdof_ ## bd ## _neon(uint8_t *_dst, ptrdiff_t _dst_stride, \
153 const int16_t *_src0, const int16_t *_src1, \
154 int block_w, int block_h);
167 c->inter.put[0][1][0][0] = ff_vvc_put_pel_pixels4_8_neon;
168 c->inter.put[0][2][0][0] = ff_vvc_put_pel_pixels8_8_neon;
169 c->inter.put[0][3][0][0] = ff_vvc_put_pel_pixels16_8_neon;
170 c->inter.put[0][4][0][0] = ff_vvc_put_pel_pixels32_8_neon;
171 c->inter.put[0][5][0][0] = ff_vvc_put_pel_pixels64_8_neon;
172 c->inter.put[0][6][0][0] = ff_vvc_put_pel_pixels128_8_neon;
174 c->inter.put[0][1][0][1] = ff_vvc_put_qpel_h4_8_neon;
175 c->inter.put[0][2][0][1] = ff_vvc_put_qpel_h8_8_neon;
176 c->inter.put[0][3][0][1] = ff_vvc_put_qpel_h16_8_neon;
177 c->inter.put[0][4][0][1] =
178 c->inter.put[0][5][0][1] =
179 c->inter.put[0][6][0][1] = ff_vvc_put_qpel_h32_8_neon;
182 c->inter.put[0][2][1][0] =
183 c->inter.put[0][3][1][0] =
184 c->inter.put[0][4][1][0] =
185 c->inter.put[0][5][1][0] =
188 c->inter.put[0][1][1][1] = ff_vvc_put_qpel_hv4_8_neon;
189 c->inter.put[0][2][1][1] = ff_vvc_put_qpel_hv8_8_neon;
190 c->inter.put[0][3][1][1] = ff_vvc_put_qpel_hv16_8_neon;
191 c->inter.put[0][4][1][1] = ff_vvc_put_qpel_hv32_8_neon;
192 c->inter.put[0][5][1][1] = ff_vvc_put_qpel_hv64_8_neon;
193 c->inter.put[0][6][1][1] = ff_vvc_put_qpel_hv128_8_neon;
195 c->inter.put[1][1][0][0] = ff_vvc_put_pel_pixels4_8_neon;
196 c->inter.put[1][2][0][0] = ff_vvc_put_pel_pixels8_8_neon;
197 c->inter.put[1][3][0][0] = ff_vvc_put_pel_pixels16_8_neon;
198 c->inter.put[1][4][0][0] = ff_vvc_put_pel_pixels32_8_neon;
199 c->inter.put[1][5][0][0] = ff_vvc_put_pel_pixels64_8_neon;
200 c->inter.put[1][6][0][0] = ff_vvc_put_pel_pixels128_8_neon;
202 c->inter.put[1][1][0][1] = ff_vvc_put_epel_h4_8_neon;
203 c->inter.put[1][2][0][1] = ff_vvc_put_epel_h8_8_neon;
204 c->inter.put[1][3][0][1] = ff_vvc_put_epel_h16_8_neon;
205 c->inter.put[1][4][0][1] =
206 c->inter.put[1][5][0][1] =
207 c->inter.put[1][6][0][1] = ff_vvc_put_epel_h32_8_neon;
209 c->inter.put[1][1][1][1] = ff_vvc_put_epel_hv4_8_neon;
210 c->inter.put[1][2][1][1] = ff_vvc_put_epel_hv8_8_neon;
211 c->inter.put[1][3][1][1] = ff_vvc_put_epel_hv16_8_neon;
212 c->inter.put[1][4][1][1] = ff_vvc_put_epel_hv32_8_neon;
213 c->inter.put[1][5][1][1] = ff_vvc_put_epel_hv64_8_neon;
214 c->inter.put[1][6][1][1] = ff_vvc_put_epel_hv128_8_neon;
216 c->inter.put_uni[0][1][0][0] = ff_vvc_put_pel_uni_pixels4_8_neon;
217 c->inter.put_uni[0][2][0][0] = ff_vvc_put_pel_uni_pixels8_8_neon;
218 c->inter.put_uni[0][3][0][0] = ff_vvc_put_pel_uni_pixels16_8_neon;
219 c->inter.put_uni[0][4][0][0] = ff_vvc_put_pel_uni_pixels32_8_neon;
220 c->inter.put_uni[0][5][0][0] = ff_vvc_put_pel_uni_pixels64_8_neon;
221 c->inter.put_uni[0][6][0][0] = ff_vvc_put_pel_uni_pixels128_8_neon;
223 c->inter.put_uni[0][1][0][1] = ff_vvc_put_qpel_uni_h4_8_neon;
224 c->inter.put_uni[0][2][0][1] = ff_vvc_put_qpel_uni_h8_8_neon;
225 c->inter.put_uni[0][3][0][1] = ff_vvc_put_qpel_uni_h16_8_neon;
226 c->inter.put_uni[0][4][0][1] =
227 c->inter.put_uni[0][5][0][1] =
228 c->inter.put_uni[0][6][0][1] = ff_vvc_put_qpel_uni_h32_8_neon;
230 c->inter.put_uni_w[0][1][0][0] = ff_vvc_put_pel_uni_w_pixels4_8_neon;
231 c->inter.put_uni_w[0][2][0][0] = ff_vvc_put_pel_uni_w_pixels8_8_neon;
232 c->inter.put_uni_w[0][3][0][0] = ff_vvc_put_pel_uni_w_pixels16_8_neon;
233 c->inter.put_uni_w[0][4][0][0] = ff_vvc_put_pel_uni_w_pixels32_8_neon;
234 c->inter.put_uni_w[0][5][0][0] = ff_vvc_put_pel_uni_w_pixels64_8_neon;
235 c->inter.put_uni_w[0][6][0][0] = ff_vvc_put_pel_uni_w_pixels128_8_neon;
238 c->inter.w_avg = vvc_w_avg_8;
239 c->inter.dmvr[0][0] = ff_vvc_dmvr_8_neon;
240 c->inter.dmvr[0][1] = ff_vvc_dmvr_h_8_neon;
241 c->inter.dmvr[1][0] = ff_vvc_dmvr_v_8_neon;
242 c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_8_neon;
243 c->inter.apply_bdof = ff_vvc_apply_bdof_8_neon;
251 c->alf.filter[
LUMA] = alf_filter_luma_8_neon;
252 c->alf.filter[
CHROMA] = alf_filter_chroma_8_neon;
253 c->alf.classify = alf_classify_8_neon;
256 c->inter.put[0][1][0][1] = ff_vvc_put_qpel_h4_8_neon_i8mm;
257 c->inter.put[0][2][0][1] = ff_vvc_put_qpel_h8_8_neon_i8mm;
258 c->inter.put[0][3][0][1] = ff_vvc_put_qpel_h16_8_neon_i8mm;
259 c->inter.put[0][4][0][1] = ff_vvc_put_qpel_h32_8_neon_i8mm;
260 c->inter.put[0][5][0][1] = ff_vvc_put_qpel_h64_8_neon_i8mm;
261 c->inter.put[0][6][0][1] = ff_vvc_put_qpel_h128_8_neon_i8mm;
263 c->inter.put[0][1][1][1] = ff_vvc_put_qpel_hv4_8_neon_i8mm;
264 c->inter.put[0][2][1][1] = ff_vvc_put_qpel_hv8_8_neon_i8mm;
265 c->inter.put[0][3][1][1] = ff_vvc_put_qpel_hv16_8_neon_i8mm;
266 c->inter.put[0][4][1][1] = ff_vvc_put_qpel_hv32_8_neon_i8mm;
267 c->inter.put[0][5][1][1] = ff_vvc_put_qpel_hv64_8_neon_i8mm;
268 c->inter.put[0][6][1][1] = ff_vvc_put_qpel_hv128_8_neon_i8mm;
270 c->inter.put[1][1][0][1] = ff_vvc_put_epel_h4_8_neon_i8mm;
271 c->inter.put[1][2][0][1] = ff_vvc_put_epel_h8_8_neon_i8mm;
272 c->inter.put[1][3][0][1] = ff_vvc_put_epel_h16_8_neon_i8mm;
273 c->inter.put[1][4][0][1] = ff_vvc_put_epel_h32_8_neon_i8mm;
274 c->inter.put[1][5][0][1] = ff_vvc_put_epel_h64_8_neon_i8mm;
275 c->inter.put[1][6][0][1] = ff_vvc_put_epel_h128_8_neon_i8mm;
277 c->inter.put[1][1][1][1] = ff_vvc_put_epel_hv4_8_neon_i8mm;
278 c->inter.put[1][2][1][1] = ff_vvc_put_epel_hv8_8_neon_i8mm;
279 c->inter.put[1][3][1][1] = ff_vvc_put_epel_hv16_8_neon_i8mm;
280 c->inter.put[1][4][1][1] = ff_vvc_put_epel_hv32_8_neon_i8mm;
281 c->inter.put[1][5][1][1] = ff_vvc_put_epel_hv64_8_neon_i8mm;
282 c->inter.put[1][6][1][1] = ff_vvc_put_epel_hv128_8_neon_i8mm;
285 c->alf.filter[
LUMA] = alf_filter_luma_8_sme2;
287 }
else if (bd == 10) {
289 c->inter.w_avg = vvc_w_avg_10;
290 c->inter.dmvr[0][1] = ff_vvc_dmvr_h_10_neon;
291 c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_10_neon;
292 c->inter.apply_bdof = ff_vvc_apply_bdof_10_neon;
295 c->inter.put[0][4][0][1] =
296 c->inter.put[0][5][0][1] =
302 c->inter.put[0][4][1][0] =
303 c->inter.put[0][5][1][0] =
308 c->inter.put[0][4][1][1] =
309 c->inter.put[0][5][1][1] =
312 c->alf.filter[
LUMA] = alf_filter_luma_10_neon;
313 c->alf.filter[
CHROMA] = alf_filter_chroma_10_neon;
314 c->alf.classify = alf_classify_10_neon;
316 c->alf.filter[
LUMA] = alf_filter_luma_10_sme2;
318 }
else if (bd == 12) {
320 c->inter.w_avg = vvc_w_avg_12;
321 c->inter.dmvr[0][0] = ff_vvc_dmvr_12_neon;
322 c->inter.dmvr[0][1] = ff_vvc_dmvr_h_12_neon;
323 c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_12_neon;
324 c->inter.apply_bdof = ff_vvc_apply_bdof_12_neon;
327 c->inter.put[0][4][0][1] =
328 c->inter.put[0][5][0][1] =
333 c->inter.put[0][4][1][1] =
334 c->inter.put[0][5][1][1] =
340 c->inter.put[0][4][1][0] =
341 c->inter.put[0][5][1][0] =
344 c->alf.filter[
LUMA] = alf_filter_luma_12_neon;
345 c->alf.filter[
CHROMA] = alf_filter_chroma_12_neon;
346 c->alf.classify = alf_classify_12_neon;
348 c->alf.filter[
LUMA] = alf_filter_luma_12_sme2;
void ff_vvc_put_qpel_v8_8_neon(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width)
void ff_vvc_put_luma_h_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
void ff_vvc_put_luma_h16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
void ff_vvc_put_luma_v8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
void ff_vvc_put_luma_hv16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
static atomic_int cpu_flags
uint8_t ptrdiff_t const uint8_t * _src
void ff_vvc_put_luma_h8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
void ff_vvc_put_luma_h16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
#define W_AVG_FUN(bit_depth)
void ff_vvc_w_avg_10_neon(uint8_t *_dst, ptrdiff_t _dst_stride, const int16_t *src0, const int16_t *src1, int width, int height, uintptr_t w0_w1, uintptr_t offset_shift)
void ff_vvc_put_qpel_v4_8_neon(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width)
void ff_vvc_put_luma_hv_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
#define FF_ARRAY_ELEMS(a)
void ff_vvc_put_luma_hv16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
void ff_vvc_put_luma_hv8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
void ff_vvc_put_luma_hv_x16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
void ff_h26x_sao_band_filter_16x16_8_neon(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, const int16_t *sao_offset_val, int sao_left_class, int width, int height)
static double grad(int hash, double x, double y, double z)
void ff_vvc_put_luma_hv8_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
void ff_vvc_sao_edge_filter_8x8_8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst, const int16_t *sao_offset_val, int eo, int width, int height)
Undefined Behavior: In the C language, some operations are undefined, like signed integer overflow, dereferencing freed pointers, accessing outside allocated space, ... Undefined Behavior must not occur in a C program; it is not safe even if the output of undefined operations is unused. The unsafety may seem nit-picking, but optimizing compilers have in fact optimized code on the assumption that no undefined behavior occurs. Optimizing code based on wrong assumptions can, and in some cases has, led to effects beyond the output of computations. The signed integer overflow problem in speed-critical code: Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
void ff_vvc_put_luma_v_x16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst
void ff_vvc_put_luma_h_x16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
#define i(width, name, range_min, range_max)
void ff_vvc_avg_8_neon(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src0, const int16_t *src1, int width, int height)
void ff_h26x_sao_band_filter_8x8_8_neon(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, const int16_t *sao_offset_val, int sao_left_class, int width, int height)
void ff_vvc_put_luma_v16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
void ff_vvc_avg_10_neon(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src0, const int16_t *src1, int width, int height)
void ff_alf_classify_sum_neon(int *sum0, int *sum1, int16_t *grad, uint32_t gshift, uint32_t steps)
uint8_t ptrdiff_t const uint8_t ptrdiff_t int const int8_t * hf
void ff_vvc_put_luma_v4_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
void ff_vvc_put_luma_v8_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
void ff_vvc_put_luma_v4_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
static const int16_t steps[16]
void ff_vvc_w_avg_8_neon(uint8_t *_dst, ptrdiff_t _dst_stride, const int16_t *src0, const int16_t *src1, int width, int height, uintptr_t w0_w1, uintptr_t offset_shift)
void ff_vvc_w_avg_12_neon(uint8_t *_dst, ptrdiff_t _dst_stride, const int16_t *src0, const int16_t *src1, int width, int height, uintptr_t w0_w1, uintptr_t offset_shift)
void ff_vvc_avg_12_neon(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src0, const int16_t *src1, int width, int height)
void ff_vvc_sao_edge_filter_16x16_8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst, const int16_t *sao_offset_val, int eo, int width, int height)
#define have_sme_i16i64(flags)
int ff_vvc_sad_neon(const int16_t *src0, const int16_t *src1, int dx, int dy, const int block_w, const int block_h)
#define APPLY_BDOF_FUNC(bd)
void ff_vvc_put_luma_v16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
void ff_vvc_put_luma_h8_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
uint8_t ptrdiff_t const uint8_t ptrdiff_t int const int8_t const int8_t * vf
void ff_vvc_put_luma_v_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)