28 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
30 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
32 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
37 uint8_t *dst,
int32_t dst_stride,
41 __m128i mask0, mask1, mask2, mask3, out1, out2;
43 __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
44 __m128i filt0, filt1, filt2, filt3;
45 __m128i res0, res1, res2, res3;
52 filt0, filt1, filt2, filt3);
54 DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
55 mask3 = __lsx_vaddi_bu(mask0, 6);
57 for (loop_cnt =
height; loop_cnt--;) {
61 src4, src5, src6, src7);
68 DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
69 vec3, filt0, res0, res1, res2, res3);
74 DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec0, filt2, res1, vec1, filt2,
75 res2, vec2, filt2, res3, vec3, filt2, res0, res1, res2, res3);
80 DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt1, res1, vec5, filt1,
81 res2, vec6, filt1, res3, vec7, filt1, res0, res1, res2, res3);
86 DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt3, res1, vec5, filt3,
87 res2, vec6, filt3, res3, vec7, filt3, res0, res1, res2, res3);
89 DUP2_ARG3(__lsx_vssrarni_bu_h, res1, res0, 6, res3, res2, 6,
91 __lsx_vst(out1, dst, 0);
92 __lsx_vst(out2, dst, 16);
94 DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src5, src5, mask0,
96 DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src7, src7, mask0,
98 DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
99 vec3, filt0, res0, res1, res2, res3);
100 DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask2, src5, src5, mask2,
102 DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask2, src7, src7, mask2,
104 DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec0, filt2, res1, vec1, filt2,
105 res2, vec2, filt2, res3, vec3, filt2, res0, res1, res2, res3);
106 DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask1, src5, src5, mask1,
108 DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask1, src7, src7, mask1,
110 DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt1, res1, vec5, filt1,
111 res2, vec6, filt1, res3, vec7, filt1, res0, res1, res2, res3);
112 DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask3, src5, src5, mask3,
114 DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask3, src7, src7, mask3,
116 DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt3, res1, vec5, filt3,
117 res2, vec6, filt3, res3, vec7, filt3, res0, res1, res2, res3);
119 DUP2_ARG3(__lsx_vssrarni_bu_h, res1, res0, 6, res3, res2, 6,
121 __lsx_vst(out1, dst, 32);
122 __lsx_vst(out2, dst, 48);
129 uint8_t *dst,
int32_t dst_stride,
133 int32_t src_stride_2x = (src_stride << 1);
134 int32_t dst_stride_2x = (dst_stride << 1);
135 int32_t src_stride_4x = (src_stride << 2);
136 int32_t dst_stride_4x = (dst_stride << 2);
137 int32_t src_stride_3x = src_stride_2x + src_stride;
138 int32_t dst_stride_3x = dst_stride_2x + dst_stride;
140 __m128i
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9, src10;
141 __m128i src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
142 __m128i src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
144 __m128i out0_r, out1_r, out2_r, out3_r;
146 src -= src_stride_3x;
148 filt0, filt1, filt2, filt3);
152 src3 = __lsx_vldx(
src, src_stride_3x);
153 src += src_stride_4x;
154 src4 = __lsx_vld(
src, 0);
155 DUP2_ARG2(__lsx_vldx,
src, src_stride,
src, src_stride_2x, src5, src6);
156 src += src_stride_3x;
158 src10_r, src32_r, src54_r, src21_r);
159 DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
161 for (loop_cnt = (
height >> 2); loop_cnt--;) {
162 src7 = __lsx_vld(
src, 0);
163 DUP2_ARG2(__lsx_vldx,
src, src_stride,
src, src_stride_2x, src8, src9);
164 src10 = __lsx_vldx(
src, src_stride_3x);
165 src += src_stride_4x;
167 DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
168 src9, src76_r, src87_r, src98_r, src109_r);
169 DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src32_r,
170 filt0, src43_r, filt0, out0_r, out1_r, out2_r, out3_r);
171 DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out1_r,
172 src43_r, filt1, out2_r, src54_r, filt1, out3_r, src65_r,
173 filt1, out0_r, out1_r, out2_r, out3_r);
174 DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src54_r, filt2, out1_r,
175 src65_r, filt2, out2_r, src76_r, filt2, out3_r, src87_r,
176 filt2, out0_r, out1_r, out2_r, out3_r);
177 DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src76_r, filt3, out1_r,
178 src87_r, filt3, out2_r, src98_r, filt3, out3_r, src109_r,
179 filt3, out0_r, out1_r, out2_r, out3_r);
181 DUP2_ARG3(__lsx_vssrarni_bu_h, out1_r, out0_r, 6, out3_r, out2_r, 6,
183 __lsx_vstelm_d(tmp0, dst, 0, 0);
184 __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
185 __lsx_vstelm_d(tmp1, dst + dst_stride_2x, 0, 0);
186 __lsx_vstelm_d(tmp1, dst + dst_stride_3x, 0, 1);
187 dst += dst_stride_4x;
204 const uint8_t *src_tmp;
206 uint32_t loop_cnt, cnt;
207 const int32_t src_stride_2x = (src_stride << 1);
208 const int32_t dst_stride_2x = (dst_stride << 1);
209 const int32_t src_stride_4x = (src_stride << 2);
210 const int32_t dst_stride_4x = (dst_stride << 2);
211 const int32_t src_stride_3x = src_stride_2x + src_stride;
212 const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
214 __m128i
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9, src10;
215 __m128i filt0, filt1, filt2, filt3;
216 __m128i src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
217 __m128i src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
218 __m128i src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
219 __m128i tmp0, tmp1, tmp2, tmp3;
220 __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
222 src -= src_stride_3x;
223 DUP4_ARG2(__lsx_vldrepl_h,
filter, 0,
filter, 2,
filter, 4,
filter, 6, filt0,
224 filt1, filt2, filt3);
226 for (cnt = (
width >> 4); cnt--;) {
230 src0 = __lsx_vld(src_tmp, 0);
231 DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
233 src3 = __lsx_vldx(src_tmp, src_stride_3x);
234 src_tmp += src_stride_4x;
235 src4 = __lsx_vld(src_tmp, 0);
236 DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
238 src_tmp += src_stride_3x;
240 src10_r, src32_r, src54_r, src21_r);
241 DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
243 src10_l, src32_l, src54_l, src21_l);
244 DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, src43_l, src65_l);
246 for (loop_cnt = (
height >> 2); loop_cnt--;) {
247 src7 = __lsx_vld(src_tmp, 0);
248 DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
250 src10 = __lsx_vldx(src_tmp, src_stride_3x);
251 src_tmp += src_stride_4x;
252 DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
253 src9, src76_r, src87_r, src98_r, src109_r);
254 DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10,
255 src9, src76_l, src87_l, src98_l, src109_l);
256 DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src32_r,
257 filt0, src43_r, filt0, out0_r, out1_r, out2_r, out3_r);
258 DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out1_r,
259 src43_r, filt1, out2_r, src54_r, filt1, out3_r, src65_r,
260 filt1, out0_r, out1_r, out2_r, out3_r);
261 DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src54_r, filt2, out1_r,
262 src65_r, filt2, out2_r, src76_r, filt2, out3_r, src87_r,
263 filt2, out0_r, out1_r, out2_r, out3_r);
264 DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src76_r, filt3, out1_r,
265 src87_r, filt3, out2_r, src98_r, filt3, out3_r, src109_r,
266 filt3, out0_r, out1_r, out2_r, out3_r);
267 DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_l, filt0, src21_l, filt0, src32_l,
268 filt0, src43_l, filt0, out0_l, out1_l, out2_l, out3_l);
269 DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_l, src32_l, filt1, out1_l,
270 src43_l, filt1, out2_l, src54_l, filt1, out3_l, src65_l,
271 filt1, out0_l, out1_l, out2_l, out3_l);
272 DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_l, src54_l, filt2, out1_l,
273 src65_l, filt2, out2_l, src76_l, filt2, out3_l, src87_l,
274 filt2, out0_l, out1_l, out2_l, out3_l);
275 DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_l, src76_l, filt3, out1_l,
276 src87_l, filt3, out2_l, src98_l, filt3, out3_l, src109_l,
277 filt3, out0_l, out1_l, out2_l, out3_l);
278 DUP4_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out1_l, out1_r,
279 6, out2_l, out2_r, 6, out3_l, out3_r, 6,
280 tmp0, tmp1, tmp2, tmp3);
281 __lsx_vst(tmp0, dst_tmp, 0);
282 __lsx_vstx(tmp1, dst_tmp, dst_stride);
283 __lsx_vstx(tmp2, dst_tmp, dst_stride_2x);
284 __lsx_vstx(tmp3, dst_tmp, dst_stride_3x);
285 dst_tmp += dst_stride_4x;
308 uint8_t *dst,
int32_t dst_stride,
317 uint8_t *dst,
int32_t dst_stride,
324 uint8_t *dst,
int32_t dst_stride,
331 uint8_t *dst,
int32_t dst_stride,
339 int32_t dst_stride,
const int8_t *filter_x,
342 uint32_t loop_cnt, cnt;
343 const uint8_t *src_tmp;
345 const int32_t src_stride_2x = (src_stride << 1);
346 const int32_t dst_stride_2x = (dst_stride << 1);
347 const int32_t src_stride_4x = (src_stride << 2);
348 const int32_t src_stride_3x = src_stride_2x + src_stride;
351 __m128i
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
352 __m128i filt0, filt1, filt2, filt3;
353 __m128i filt_h0, filt_h1, filt_h2, filt_h3;
354 __m128i mask1, mask2, mask3;
356 __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
357 __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
358 __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
359 __m128i dst0_r, dst0_l, dst1_r, dst1_l;
360 __m128i dst10_r, dst32_r, dst54_r, dst76_r;
361 __m128i dst10_l, dst32_l, dst54_l, dst76_l;
362 __m128i dst21_r, dst43_r, dst65_r, dst87_r;
363 __m128i dst21_l, dst43_l, dst65_l, dst87_l;
366 src -= (src_stride_3x + 3);
367 DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4,
368 filter_x, 6, filt0, filt1, filt2, filt3);
370 filter_vec = __lsx_vld(filter_y, 0);
371 filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
372 DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2,
373 filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3);
375 DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
376 mask3 = __lsx_vaddi_bu(mask0, 6);
378 for (cnt =
width >> 3; cnt--;) {
382 src0 = __lsx_vld(src_tmp, 0);
383 DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
385 src3 = __lsx_vldx(src_tmp, src_stride_3x);
386 src_tmp += src_stride_4x;
387 src4 = __lsx_vld(src_tmp, 0);
388 DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
390 src_tmp += src_stride_3x;
394 src0, mask2,
src0,
src0, mask3, vec0, vec1, vec2, vec3);
396 src1, mask2,
src1,
src1, mask3, vec4, vec5, vec6, vec7);
398 src2, mask2,
src2,
src2, mask3, vec8, vec9, vec10, vec11);
399 DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
400 src3, mask2, src3, src3, mask3, vec12, vec13, vec14, vec15);
401 DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, vec8, filt0,
402 vec12, filt0, dst0, dst1, dst2, dst3);
403 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec5, filt1,
404 dst2, vec9, filt1, dst3, vec13, filt1, dst0, dst1, dst2, dst3);
405 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec2, filt2, dst1, vec6, filt2,
406 dst2, vec10, filt2, dst3, vec14, filt2, dst0, dst1, dst2, dst3);
407 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec3, filt3, dst1, vec7, filt3,
408 dst2, vec11, filt3, dst3, vec15, filt3, dst0, dst1, dst2, dst3);
410 DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4,
411 src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
412 DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5,
413 src5, mask2, src5, src5, mask3, vec4, vec5, vec6, vec7);
414 DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6,
415 src6, mask2, src6, src6, mask3, vec8, vec9, vec10, vec11);
416 DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, dst4, dst5);
417 dst6 = __lsx_vdp2_h_bu_b(vec8, filt0);
418 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst5, vec5, filt1,
419 dst6, vec9, filt1, dst4, vec2, filt2, dst4, dst5, dst6, dst4);
420 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec6, filt2, dst6, vec10, filt2,
421 dst4, vec3, filt3, dst5, vec7, filt3, dst5, dst6, dst4, dst5);
422 dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3);
423 DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst2,
424 dst1, dst10_r, dst32_r, dst54_r, dst21_r);
425 DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst2,
426 dst1, dst10_l, dst32_l, dst54_l, dst21_l);
427 DUP2_ARG2(__lsx_vilvl_h, dst4, dst3, dst6, dst5, dst43_r, dst65_r);
428 DUP2_ARG2(__lsx_vilvh_h, dst4, dst3, dst6, dst5, dst43_l, dst65_l);
430 for (loop_cnt =
height >> 1; loop_cnt--;) {
431 src7 = __lsx_vld(src_tmp, 0);
432 src8 = __lsx_vldx(src_tmp, src_stride);
433 src_tmp += src_stride_2x;
435 DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7,
436 src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3);
437 dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
438 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2,
440 dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
441 dst76_r = __lsx_vilvl_h(dst7, dst6);
442 dst76_l = __lsx_vilvh_h(dst7, dst6);
443 DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
445 DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
446 dst32_l, filt_h1, dst0_r, dst54_r, filt_h2, dst0_l,
447 dst54_l, filt_h2, dst0_r, dst0_l, dst0_r, dst0_l);
448 DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l,
449 dst76_l, filt_h3, dst0_r, dst0_l);
450 DUP2_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst0_r, dst0_l);
452 DUP4_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, src8,
453 src8, mask2, src8, src8, mask3, vec0, vec1, vec2, vec3);
454 dst8 = __lsx_vdp2_h_bu_b(vec0, filt0);
455 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst8, vec1, filt1, dst8, vec2,
457 dst8 = __lsx_vdp2add_h_bu_b(dst8, vec3, filt3);
459 dst87_r = __lsx_vilvl_h(dst8, dst7);
460 dst87_l = __lsx_vilvh_h(dst8, dst7);
461 DUP2_ARG2(__lsx_vdp2_w_h, dst21_r, filt_h0, dst21_l, filt_h0,
463 DUP4_ARG3(__lsx_vdp2add_w_h, dst1_r, dst43_r, filt_h1, dst1_l,
464 dst43_l, filt_h1, dst1_r, dst65_r, filt_h2, dst1_l,
465 dst65_l, filt_h2, dst1_r, dst1_l, dst1_r, dst1_l);
466 DUP2_ARG3(__lsx_vdp2add_w_h, dst1_r, dst87_r, filt_h3, dst1_l,
467 dst87_l, filt_h3, dst1_r, dst1_l);
468 DUP2_ARG2(__lsx_vsrai_w, dst1_r, 6, dst1_l, 6, dst1_r, dst1_l);
469 DUP4_ARG2(__lsx_vsrari_w, dst0_r, 6, dst0_l, 6,dst1_r, 6, dst1_l,
470 6, dst0_r, dst0_l, dst1_r, dst1_l);
471 DUP4_ARG1(__lsx_vclip255_w, dst0_l, dst0_r, dst1_l, dst1_r,
472 dst0_l, dst0_r, dst1_l, dst1_r);
473 DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r,
475 out = __lsx_vpickev_b(dst1, dst0);
476 __lsx_vstelm_d(
out, dst_tmp, 0, 0);
477 __lsx_vstelm_d(
out, dst_tmp + dst_stride, 0, 1);
478 dst_tmp += dst_stride_2x;
500 int32_t dst_stride,
const int8_t *filter_x,
504 filter_x, filter_y,
height, 8);
508 int32_t dst_stride,
const int8_t *filter_x,
512 filter_x, filter_y,
height, 16);
516 int32_t dst_stride,
const int8_t *filter_x,
520 filter_x, filter_y,
height, 24);
524 int32_t dst_stride,
const int8_t *filter_x,
528 filter_x, filter_y,
height, 32);
532 int32_t dst_stride,
const int8_t *filter_x,
536 filter_x, filter_y,
height, 48);
540 int32_t dst_stride,
const int8_t *filter_x,
544 filter_x, filter_y,
height, 64);
549 uint8_t *dst,
int32_t dst_stride,
553 int32_t src_stride_2x = (src_stride << 1);
554 int32_t src_stride_3x = src_stride_2x + src_stride;
557 __m128i
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9, src10;
558 __m128i src11, filt0, filt1;
559 __m128i src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
560 __m128i src109_r, src10_l, src32_l, src21_l, src43_l;
561 __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
562 __m128i out1, out2, out3, out4;
575 src6 = __lsx_vld(_src, 0);
576 DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, src7, src8);
577 src += src_stride_3x;
578 _src += src_stride_3x;
579 DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
581 for (loop_cnt = 8; loop_cnt--;) {
584 DUP2_ARG2(__lsx_vldx,
src, src_stride, _src, src_stride, src4, src10);
585 DUP2_ARG2(__lsx_vilvl_b, src3,
src2, src4, src3, src32_r, src43_r);
586 DUP2_ARG2(__lsx_vilvh_b, src3,
src2, src4, src3, src32_l, src43_l);
589 src += src_stride_2x;
590 _src += src_stride_2x;
591 DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
594 DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
595 filt0, src21_l, filt0, out0_r, out0_l, out1_r, out1_l);
596 DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out0_l, src32_l,
597 filt1, out1_r, src43_r, filt1, out1_l, src43_l, filt1,
598 out0_r, out0_l, out1_r, out1_l);
601 DUP2_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src87_r, filt0,
603 DUP2_ARG3(__lsx_vdp2add_h_bu_b, out2_r, src98_r, filt1, out3_r,
604 src109_r, filt1, out2_r, out3_r);
607 DUP4_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out2_r, out2_r, 6,
608 out3_r, out3_r, 6, out1_l, out1_r, 6, out1, out2, out3, out4);
609 __lsx_vst(out1, dst, 0);
610 __lsx_vstelm_d(out2, dst, 16, 0);
612 __lsx_vst(out4, dst, 0);
613 __lsx_vstelm_d(out3, dst, 16, 0);
619 DUP2_ARG2(__lsx_vilvl_b, src5, src4,
src2, src5, src10_r, src21_r);
620 DUP2_ARG2(__lsx_vilvh_b, src5, src4,
src2, src5, src10_l, src21_l);
623 src += src_stride_2x;
624 _src += src_stride_2x;
625 DUP2_ARG2(__lsx_vilvl_b, src11, src10, src8, src11, src76_r, src87_r);
628 DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
629 filt0, src43_l, filt0, out0_r, out0_l, out1_r, out1_l);
630 DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src10_r, filt1, out0_l, src10_l,
631 filt1, out1_r, src21_r, filt1, out1_l, src21_l, filt1,
632 out0_r, out0_l, out1_r, out1_l);
635 DUP2_ARG2(__lsx_vdp2_h_bu_b, src98_r, filt0, src109_r, filt0,
637 DUP2_ARG3(__lsx_vdp2add_h_bu_b, out2_r, src76_r, filt1, out3_r,
638 src87_r, filt1, out2_r, out3_r);
641 DUP4_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out2_r, out2_r, 6,
642 out1_l, out1_r, 6, out3_r, out3_r, 6, out1, out2, out3, out4);
644 __lsx_vst(out1, dst, 0);
645 __lsx_vstelm_d(out2, dst, 16, 0);
647 __lsx_vst(out3, dst, 0);
648 __lsx_vstelm_d(out4, dst, 16, 0);
655 uint8_t *dst,
int32_t dst_stride,
659 int32_t src_stride_2x = (src_stride << 1);
660 int32_t dst_stride_2x = (dst_stride << 1);
661 int32_t src_stride_3x = src_stride_2x + src_stride;
664 __m128i
src0,
src1,
src2, src3, src4, src6, src7, src8, src9, src10;
665 __m128i src10_r, src32_r, src76_r, src98_r;
666 __m128i src21_r, src43_r, src87_r, src109_r;
667 __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
668 __m128i src10_l, src32_l, src76_l, src98_l;
669 __m128i src21_l, src43_l, src87_l, src109_l;
670 __m128i filt0, filt1;
685 src6 = __lsx_vld(_src, 0);
686 DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, src7, src8);
687 src += src_stride_3x;
688 _src += src_stride_3x;
690 DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
691 DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src76_l, src87_l);
693 for (loop_cnt = (
height >> 1); loop_cnt--;) {
696 DUP2_ARG2(__lsx_vldx,
src, src_stride, _src, src_stride, src4, src10);
697 DUP2_ARG2(__lsx_vilvl_b, src3,
src2, src4, src3, src32_r, src43_r);
698 DUP2_ARG2(__lsx_vilvh_b, src3,
src2, src4, src3, src32_l, src43_l);
701 DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
702 filt0, src21_l, filt0, out0_r, out0_l, out1_r, out1_l);
703 DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out0_l, src32_l,
704 filt1, out1_r, src43_r, filt1, out1_l, src43_l, filt1,
705 out0_r, out0_l, out1_r, out1_l);
707 DUP2_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out1_l, out1_r, 6,
709 __lsx_vst(out1, dst, 0);
710 __lsx_vstx(out2, dst, dst_stride);
719 src += src_stride_2x;
720 _src += src_stride_2x;
721 DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
722 DUP2_ARG2(__lsx_vilvh_b, src9, src8, src10, src9, src98_l, src109_l);
725 DUP4_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src76_l, filt0, src87_r,
726 filt0, src87_l, filt0, out2_r, out2_l, out3_r, out3_l);
727 DUP4_ARG3(__lsx_vdp2add_h_bu_b, out2_r, src98_r, filt1, out2_l, src98_l,
728 filt1, out3_r, src109_r, filt1, out3_l, src109_l, filt1,
729 out2_r, out2_l, out3_r, out3_l);
732 DUP2_ARG3(__lsx_vssrarni_bu_h, out2_l, out2_r, 6, out3_l, out3_r, 6,
734 __lsx_vst(out1, dst, 16);
735 __lsx_vst(out2, dst + dst_stride, 16);
737 dst += dst_stride_2x;
749 int32_t dst_stride,
const int8_t *filter_x,
750 const int8_t *filter_y)
752 const int32_t src_stride_2x = (src_stride << 1);
753 const int32_t src_stride_4x = (src_stride << 2);
754 const int32_t src_stride_3x = src_stride_2x + src_stride;
757 __m128i filt0, filt1;
758 __m128i filt_h0, filt_h1, filter_vec;
761 __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
762 __m128i dst0, dst1, dst2, dst3, dst4;
763 __m128i dst0_r, dst0_l, dst1_r, dst1_l;
764 __m128i dst10_r, dst32_r, dst21_r, dst43_r;
765 __m128i dst10_l, dst32_l, dst21_l, dst43_l;
766 __m128i out0_r, out1_r;
768 src -= (src_stride + 1);
769 DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
771 filter_vec = __lsx_vld(filter_y, 0);
772 filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
773 DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
775 mask1 = __lsx_vaddi_bu(mask0, 2);
778 src_stride_3x,
src, src_stride_4x,
src1,
src2, src3, src4);
781 mask0,
src1,
src1, mask1, vec0, vec1, vec2, vec3);
783 mask0, src3, src3, mask1, vec4, vec5, vec6, vec7);
784 DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9);
786 DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
787 filt0, dst0, dst1, dst2, dst3);
788 dst4 = __lsx_vdp2_h_bu_b(vec8, filt0);
789 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
790 vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
791 dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1);
792 DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
793 dst10_r, dst21_r, dst32_r, dst43_r);
794 DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
795 dst10_l, dst21_l, dst32_l, dst43_l);
796 DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
797 filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
798 DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
799 filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
800 dst0_r, dst0_l, dst1_r, dst1_l);
801 DUP2_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6,
803 out = __lsx_vssrarni_bu_h(out1_r, out0_r, 6);
804 __lsx_vstelm_d(
out, dst, 0, 0);
805 __lsx_vstelm_d(
out, dst + dst_stride, 0, 1);
810 int32_t dst_stride,
const int8_t *filter_x,
811 const int8_t *filter_y,
int32_t width8mult)
814 const int32_t src_stride_2x = (src_stride << 1);
815 const int32_t dst_stride_2x = (dst_stride << 1);
816 const int32_t src_stride_4x = (src_stride << 2);
817 const int32_t src_stride_3x = src_stride_2x + src_stride;
818 const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
821 __m128i
src0,
src1,
src2, src3, src4, src5, src6, mask0, mask1;
822 __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
823 __m128i filt0, filt1, filt_h0, filt_h1, filter_vec;
824 __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
825 __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
826 __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
827 __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
829 src -= (src_stride + 1);
830 DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
832 filter_vec = __lsx_vld(filter_y, 0);
833 filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
834 DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
837 mask1 = __lsx_vaddi_bu(mask0, 2);
839 for (cnt = width8mult; cnt--;) {
842 src3 = __lsx_vldx(
src, src_stride_3x);
843 src += src_stride_4x;
844 src4 = __lsx_vld(
src, 0);
845 DUP2_ARG2(__lsx_vldx,
src, src_stride,
src, src_stride_2x, src5, src6);
846 src += (8 - src_stride_4x);
854 DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
855 dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
856 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
858 dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
860 DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
861 DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
863 DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1,
865 DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1,
867 DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1,
869 DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1,
872 DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
873 vec6, filt0, dst3, dst4, dst5, dst6);
874 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3, filt1,
875 dst5, vec5, filt1, dst6, vec7, filt1, dst3, dst4, dst5, dst6);
877 DUP4_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst5, dst4, dst6,
878 dst5, dst32_r, dst43_r, dst54_r, dst65_r);
879 DUP4_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst5, dst4, dst6,
880 dst5, dst32_l, dst43_l, dst54_l, dst65_l);
882 DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
883 filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
884 DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
885 filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
886 DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
887 filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
888 dst0_r, dst0_l, dst1_r, dst1_l);
889 DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
890 filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
891 dst2_r, dst2_l, dst3_r, dst3_l);
893 DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6,
894 dst2_l, dst2_r, 6, dst3_l, dst3_r, 6, tmp0, tmp1, tmp2, tmp3);
895 DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, 6, tmp3, tmp2, 6, out0, out1);
896 __lsx_vstelm_d(out0, dst, 0, 0);
897 __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
898 __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
899 __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
906 int32_t dst_stride,
const int8_t *filter_x,
907 const int8_t *filter_y)
909 const int32_t src_stride_2x = (src_stride << 1);
910 const int32_t dst_stride_2x = (dst_stride << 1);
911 const int32_t src_stride_4x = (src_stride << 2);
912 const int32_t dst_stride_4x = (dst_stride << 2);
913 const int32_t src_stride_3x = src_stride_2x + src_stride;
914 const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
915 __m128i out0, out1, out2;
916 __m128i
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
917 __m128i filt0, filt1;
918 __m128i filt_h0, filt_h1, filter_vec;
921 __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
922 __m128i vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
923 __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
924 __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
925 __m128i dst4_r, dst4_l, dst5_r, dst5_l;
926 __m128i dst10_r, dst32_r, dst10_l, dst32_l;
927 __m128i dst21_r, dst43_r, dst21_l, dst43_l;
928 __m128i dst54_r, dst54_l, dst65_r, dst65_l;
929 __m128i dst76_r, dst76_l, dst87_r, dst87_l;
930 __m128i out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
932 src -= (src_stride + 1);
933 DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
935 filter_vec = __lsx_vld(filter_y, 0);
936 filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
937 DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
939 mask1 = __lsx_vaddi_bu(mask0, 2);
943 src_stride_3x,
src, src_stride_4x,
src1,
src2, src3, src4);
944 src += src_stride_4x;
946 src_stride_3x,
src, src_stride_4x, src5, src6, src7, src8);
949 mask0,
src1,
src1, mask1, vec0, vec1, vec2, vec3);
951 mask0, src3, src3, mask1, vec4, vec5, vec6, vec7);
952 DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src5, src5,
953 mask0, src5, src5, mask1, vec8, vec9, vec10, vec11);
954 DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src7, src7,
955 mask0, src7, src7, mask1, vec12, vec13, vec14, vec15);
956 DUP2_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, vec16, vec17);
958 DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
959 filt0, dst0, dst1, dst2, dst3);
960 DUP4_ARG2(__lsx_vdp2_h_bu_b, vec8, filt0, vec10, filt0, vec12, filt0, vec14,
961 filt0, dst4, dst5, dst6, dst7);
962 dst8 = __lsx_vdp2_h_bu_b(vec16, filt0);
963 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
964 vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
965 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec9, filt1, dst5, vec11, filt1, dst6,
966 vec13, filt1, dst7, vec15, filt1, dst4, dst5, dst6, dst7);
967 dst8 = __lsx_vdp2add_h_bu_b(dst8, vec17, filt1);
969 DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
970 dst10_r, dst21_r, dst32_r, dst43_r);
971 DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
972 dst10_l, dst21_l, dst32_l, dst43_l);
973 DUP4_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
974 dst54_r, dst65_r, dst76_r, dst87_r);
975 DUP4_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
976 dst54_l, dst65_l, dst76_l, dst87_l);
978 DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
979 filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
980 DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
981 filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
982 DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst54_l, filt_h0, dst65_r,
983 filt_h0, dst65_l, filt_h0, dst4_r, dst4_l, dst5_r, dst5_l);
984 DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
985 filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
986 dst0_r, dst0_l, dst1_r, dst1_l);
987 DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
988 filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
989 dst2_r, dst2_l, dst3_r, dst3_l);
990 DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, dst76_r, filt_h1, dst4_l, dst76_l,
991 filt_h1, dst5_r, dst87_r, filt_h1, dst5_l, dst87_l, filt_h1,
992 dst4_r, dst4_l, dst5_r, dst5_l);
994 DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6, dst2_l,
995 dst2_r, 6, dst3_l, dst3_r, 6, out0_r, out1_r, out2_r, out3_r);
996 DUP2_ARG3(__lsx_vsrani_h_w, dst4_l, dst4_r, 6, dst5_l, dst5_r, 6,
998 DUP2_ARG3(__lsx_vssrarni_bu_h, out1_r, out0_r, 6, out3_r, out2_r, 6,
1000 out2 = __lsx_vssrarni_bu_h(out5_r, out4_r, 6);
1002 __lsx_vstelm_d(out0, dst, 0, 0);
1003 __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
1004 __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
1005 __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
1006 dst += dst_stride_4x;
1007 __lsx_vstelm_d(out2, dst, 0, 0);
1008 __lsx_vstelm_d(out2, dst + dst_stride, 0, 1);
/*
 * 4-tap horizontal followed by 4-tap vertical filtering over 8-byte-wide
 * column strips.  The outer loop runs width8mult times; the inner loop runs
 * (height >> 2) times and stores four 8-byte output rows per iteration.
 *
 * NOTE(review): the head of the signature, the mask0 load and the per-strip
 * src_tmp/dst_tmp setup are not visible in this excerpt - confirm there.
 */
1013 int32_t dst_stride,
const int8_t *filter_x,
1017 uint32_t loop_cnt, cnt;
1018 const uint8_t *src_tmp;
/* 2x/3x/4x stride multiples, used as row offsets for loads and stores. */
1020 const int32_t src_stride_2x = (src_stride << 1);
1021 const int32_t dst_stride_2x = (dst_stride << 1);
1022 const int32_t src_stride_4x = (src_stride << 2);
1023 const int32_t dst_stride_4x = (dst_stride << 2);
1024 const int32_t src_stride_3x = src_stride_2x + src_stride;
1025 const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
1029 __m128i filt0, filt1;
1030 __m128i filt_h0, filt_h1, filter_vec;
1033 __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1034 __m128i dst0, dst1, dst2, dst3, dst4, dst5;
1035 __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
1036 __m128i dst10_r, dst32_r, dst21_r, dst43_r;
1037 __m128i dst10_l, dst32_l, dst21_l, dst43_l;
1038 __m128i dst54_r, dst54_l, dst65_r, dst65_l, dst6;
1039 __m128i out0_r, out1_r, out2_r, out3_r;
/* Step back one row and one byte so loads start above-left of src. */
1041 src -= (src_stride + 1);
/* filt0/filt1: the int8 pairs at filter_x[0..1] and filter_x[2..3], each
 * 16-bit pair replicated across the whole vector. */
1042 DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
/* Widen the filter_y bytes to signed halfwords (shift amount 0), then
 * replicate word lanes 0 and 1, i.e. halfword pairs (0,1) and (2,3). */
1044 filter_vec = __lsx_vld(filter_y, 0);
1045 filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
1046 DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
/* mask1 is mask0 with every byte index advanced by 2. */
1047 mask1 = __lsx_vaddi_bu(mask0, 2);
/* One pass per 8-byte-wide strip. */
1049 for (cnt = width8mult; cnt--;) {
/* Prime the vertical filter: load rows at offsets 0, stride and 2*stride,
 * then advance the strip pointer by three rows. */
1053 src0 = __lsx_vld(src_tmp, 0);
1054 DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
1056 src_tmp += src_stride_3x;
/* Horizontal filter of the priming rows: byte-pair dot product with filt0,
 * then accumulate the byte-pair dot product with filt1. */
1065 DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
1066 dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
1067 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
1069 dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
/* Interleave consecutive rows halfword-wise (low and high halves) so the
 * vertical stage can use halfword-pair dot products. */
1071 DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
1072 DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
/* Each iteration consumes four new source rows and emits four rows. */
1074 for (loop_cnt = (
height >> 2); loop_cnt--;) {
/* Load the next four rows and advance by four rows. */
1075 src3 = __lsx_vld(src_tmp, 0);
1076 DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
1078 src6 = __lsx_vldx(src_tmp, src_stride_3x);
1079 src_tmp += src_stride_4x;
/* Two shuffled views (mask0, mask1) of every new row. */
1081 DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src4,
1082 src4, mask0, src4, src4, mask1, vec0, vec1, vec2, vec3);
1083 DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src6,
1084 src6, mask0, src6, src6, mask1, vec4, vec5, vec6, vec7);
/* Horizontal filter of the four new rows into dst3..dst6. */
1086 DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
1087 vec6, filt0, dst3, dst4, dst5, dst6);
1088 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3,
1089 filt1, dst5, vec5, filt1, dst6, vec7, filt1,
1090 dst3, dst4, dst5, dst6);
/* Pair each new row with its predecessor (low and high halves). */
1092 DUP4_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst5, dst4,
1093 dst6, dst5, dst32_r, dst43_r, dst54_r, dst65_r);
1094 DUP4_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst5, dst4,
1095 dst6, dst5, dst32_l, dst43_l, dst54_l, dst65_l);
/* Vertical filter: filt_h0 on a row pair, plus filt_h1 on the pair two rows
 * later (10+32, 21+43, 32+54, 43+65), giving four 32-bit output rows. */
1097 DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
1098 filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
1099 DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
1100 filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
1101 DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
1102 dst32_l, filt_h1, dst1_r, dst43_r, filt_h1, dst1_l,
1103 dst43_l, filt_h1, dst0_r, dst0_l, dst1_r, dst1_l);
1104 DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l,
1105 dst54_l, filt_h1, dst3_r, dst65_r, filt_h1, dst3_l,
1106 dst65_l, filt_h1, dst2_r, dst2_l, dst3_r, dst3_l);
/* Narrow 32-bit sums to 16 bits with an arithmetic right shift by 6, then
 * to unsigned 8 bits with a rounding, saturating right shift by 6. */
1108 DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6,
1109 dst2_l, dst2_r, 6, dst3_l, dst3_r, 6, out0_r, out1_r,
1111 DUP2_ARG3(__lsx_vssrarni_bu_h, out1_r, out0_r, 6, out3_r, out2_r,
/* Store four rows of 8 bytes (one 64-bit lane each), then move down four
 * destination rows. */
1113 __lsx_vstelm_d(out0, dst_tmp, 0, 0);
1114 __lsx_vstelm_d(out0, dst_tmp + dst_stride, 0, 1);
1115 __lsx_vstelm_d(out1, dst_tmp + dst_stride_2x, 0, 0);
1116 __lsx_vstelm_d(out1, dst_tmp + dst_stride_3x, 0, 1);
1117 dst_tmp += dst_stride_4x;
/*
 * Dispatches on height.  The visible branches cover height == 4,
 * height == 6, and any other height whose low two bits are clear.
 * NOTE(review): the first branch's condition and every callee name are on
 * lines not visible in this excerpt - confirm against the full source.
 */
1132 int32_t dst_stride,
const int8_t *filter_x,
1137 }
else if (4 ==
height) {
/* Exactly four rows: the call ends with the constant 1. */
1139 filter_x, filter_y, 1);
1140 }
else if (6 ==
height) {
1142 }
else if (0 == (
height & 0x03)) {
/* Remaining multiples of four: height is forwarded, followed by 1. */
1144 filter_x, filter_y,
height, 1);
/*
 * 4-tap horizontal followed by 4-tap vertical filtering, done in two phases:
 *   1. an 8-byte-wide phase: 4 iterations x 4 rows, stored as 64-bit lanes
 *      through dst_tmp;
 *   2. a 4-byte-wide phase: 2 iterations x 8 rows, stored as 32-bit lanes
 *      through dst.
 * Both phases therefore write 16 rows.
 *
 * NOTE(review): the head of the signature, the mask0/mask2 loads and the
 * src_tmp/dst_tmp/src/dst setup between the phases are not visible in this
 * excerpt; presumably phase 2 covers the columns to the right of phase 1 -
 * confirm against the full source.
 */
1150 int32_t dst_stride,
const int8_t *filter_x,
1154 const uint8_t *src_tmp;
/* 2x/3x/4x stride multiples, used as row offsets for loads and stores. */
1156 const int32_t src_stride_2x = (src_stride << 1);
1157 const int32_t dst_stride_2x = (dst_stride << 1);
1158 const int32_t src_stride_4x = (src_stride << 2);
1159 const int32_t dst_stride_4x = (dst_stride << 2);
1160 const int32_t src_stride_3x = src_stride_2x + src_stride;
1161 const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
1163 __m128i
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9, src10;
1164 __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1165 __m128i mask0, mask1, mask2, mask3;
1166 __m128i filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
1167 __m128i dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
1168 __m128i dst10, dst21, dst22, dst73, dst84, dst95, dst106;
1169 __m128i dst76_r, dst98_r, dst87_r, dst109_r;
1170 __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
1171 __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
1172 __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
1173 __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
/* Step back one row and one byte so loads start above-left of src. */
1175 src -= (src_stride + 1);
/* filt0/filt1: the 16-bit pairs at filter_x byte offsets 0 and 2, each
 * replicated across the whole vector. */
1176 DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
/* Widen the filter_y bytes to signed halfwords (shift amount 0), then
 * replicate word lanes 0 and 1, i.e. halfword pairs (0,1) and (2,3). */
1178 filter_vec = __lsx_vld(filter_y, 0);
1179 filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
1180 DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
/* mask1 is mask0 with every byte index advanced by 2. */
1183 mask1 = __lsx_vaddi_bu(mask0, 2);
/* Phase 1 priming: load rows at offsets 0, stride and 2*stride, then advance
 * by three rows. */
1188 src0 = __lsx_vld(src_tmp, 0);
1189 DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
1191 src_tmp += src_stride_3x;
/* Horizontal filter of the priming rows: byte-pair dot product with filt0,
 * then accumulate the byte-pair dot product with filt1. */
1197 DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dsth0, dsth1);
1198 dsth2 = __lsx_vdp2_h_bu_b(vec4, filt0);
1199 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dsth0, vec1, filt1, dsth1, vec3, filt1,
1201 dsth2 = __lsx_vdp2add_h_bu_b(dsth2, vec5, filt1);
/* Interleave consecutive rows halfword-wise for the vertical stage. */
1203 DUP2_ARG2(__lsx_vilvl_h, dsth1, dsth0, dsth2, dsth1, dst10_r, dst21_r);
1204 DUP2_ARG2(__lsx_vilvh_h, dsth1, dsth0, dsth2, dsth1, dst10_l, dst21_l);
/* Phase 1 main loop: fixed 4 iterations, four output rows each. */
1206 for (loop_cnt = 4; loop_cnt--;) {
/* Load the next four rows and advance by four rows. */
1207 src3 = __lsx_vld(src_tmp, 0);
1208 DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
1210 src6 = __lsx_vldx(src_tmp, src_stride_3x);
1211 src_tmp += src_stride_4x;
/* Two shuffled views (mask0, mask1) of every new row. */
1213 DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src4,
1214 src4, mask0, src4, src4, mask1, vec0, vec1, vec2, vec3);
1215 DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src6,
1216 src6, mask0, src6, src6, mask1, vec4, vec5, vec6, vec7);
/* Horizontal filter of the four new rows into dsth3..dsth6. */
1218 DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
1219 vec6, filt0, dsth3, dsth4, dsth5, dsth6);
1220 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dsth3, vec1, filt1, dsth4,
1221 vec3, filt1, dsth5, vec5, filt1, dsth6, vec7, filt1,
1222 dsth3, dsth4, dsth5, dsth6);
/* Pair each new row with its predecessor (low and high halves). */
1224 DUP4_ARG2(__lsx_vilvl_h, dsth3, dsth2, dsth4, dsth3, dsth5, dsth4,
1225 dsth6, dsth5, dst32_r, dst43_r, dst54_r, dst65_r);
1226 DUP4_ARG2(__lsx_vilvh_h, dsth3, dsth2, dsth4, dsth3, dsth5, dsth4,
1227 dsth6, dsth5, dst32_l, dst43_l, dst54_l, dst65_l);
/* Vertical filter: filt_h0 on a row pair, plus filt_h1 on the pair two rows
 * later (10+32, 21+43, 32+54, 43+65). */
1229 DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
1230 filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
1231 DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
1232 filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
1233 DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
1234 filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
1235 dst0_r, dst0_l, dst1_r, dst1_l);
1236 DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
1237 filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
1238 dst2_r, dst2_l, dst3_r, dst3_l);
/* Narrow 32-bit sums to 16 bits with an arithmetic right shift by 6, then
 * to unsigned 8 bits with a rounding, saturating right shift by 6. */
1240 DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6, dst2_l,
1241 dst2_r, 6, dst3_l, dst3_r, 6, tmp0, tmp1, tmp2, tmp3);
1242 DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, 6, tmp3, tmp2, 6, out0, out1);
/* Store four rows of 8 bytes (one 64-bit lane each), then move down four
 * destination rows. */
1244 __lsx_vstelm_d(out0, dst_tmp, 0, 0);
1245 __lsx_vstelm_d(out0, dst_tmp + dst_stride, 0, 1);
1246 __lsx_vstelm_d(out1, dst_tmp + dst_stride_2x, 0, 0);
1247 __lsx_vstelm_d(out1, dst_tmp + dst_stride_3x, 0, 1);
1248 dst_tmp += dst_stride_4x;
/* Phase 2 (4-byte-wide stores).  mask3 is mask2 with every byte index
 * advanced by 2. */
1261 mask3 = __lsx_vaddi_bu(mask2, 2);
/* Advance src by three rows. */
1265 src += src_stride_3x;
/* Horizontal filter of the priming data into dst10 and dst21. */
1269 DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst10, dst21);
1270 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst10, vec1, filt1, dst21, vec3, filt1,
/* Low and high halfword interleaves of dst21/dst10; dst22 is the upper
 * 64 bits of dst21 broadcast to both lanes. */
1273 dst10_r = __lsx_vilvl_h(dst21, dst10);
1274 dst21_r = __lsx_vilvh_h(dst21, dst10);
1275 dst22 = __lsx_vreplvei_d(dst21, 1);
/* Phase 2 main loop: fixed 2 iterations, eight output rows each. */
1277 for (loop_cnt = 2; loop_cnt--;) {
/* Load eight rows (src3..src10) in two groups of four, advancing src by
 * four rows after each group. */
1278 src3 = __lsx_vld(
src, 0);
1279 DUP2_ARG2(__lsx_vldx,
src, src_stride,
src, src_stride_2x, src4, src5);
1280 src6 = __lsx_vldx(
src, src_stride_3x);
1281 src += src_stride_4x;
1282 src7 = __lsx_vld(
src, 0);
1283 DUP2_ARG2(__lsx_vldx,
src, src_stride,
src, src_stride_2x, src8, src9);
1284 src10 = __lsx_vldx(
src, src_stride_3x);
1285 src += src_stride_4x;
/* Each shuffle takes two rows four apart as its operands (src7/src3,
 * src8/src4, ...); presumably mask2/mask3 pick bytes from both so that one
 * vector carries two rows - confirm against the mask table. */
1286 DUP4_ARG3(__lsx_vshuf_b, src7, src3, mask2, src7, src3, mask3, src8,
1287 src4, mask2, src8, src4, mask3, vec0, vec1, vec2, vec3);
1288 DUP4_ARG3(__lsx_vshuf_b, src9, src5, mask2, src9, src5, mask3, src10,
1289 src6, mask2, src10, src6, mask3, vec4, vec5, vec6, vec7);
/* Horizontal filter into dst73, dst84, dst95, dst106. */
1291 DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
1292 vec6, filt0, dst73, dst84, dst95, dst106);
1293 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst73, vec1, filt1, dst84, vec3,
1294 filt1, dst95, vec5, filt1, dst106, vec7, filt1,
1295 dst73, dst84, dst95, dst106);
/* Build the interleaved pairs for the vertical stage: low halves give
 * dst32_r..dst65_r, high halves give dst87_r..dst109_r. */
1297 dst32_r = __lsx_vilvl_h(dst73, dst22);
1298 DUP2_ARG2(__lsx_vilvl_h, dst84, dst73, dst95, dst84, dst43_r, dst54_r);
1299 DUP2_ARG2(__lsx_vilvh_h, dst84, dst73, dst95, dst84, dst87_r, dst98_r);
1300 dst65_r = __lsx_vilvl_h(dst106, dst95);
1301 dst109_r = __lsx_vilvh_h(dst106, dst95);
/* dst22 is reused as a temporary: the upper 64 bits of dst73, paired with
 * the low half of dst106 to form dst76_r. */
1302 dst22 = __lsx_vreplvei_d(dst73, 1);
1303 dst76_r = __lsx_vilvl_h(dst22, dst106);
/* Vertical filter for eight rows: filt_h0 on dst10_r..dst87_r, plus filt_h1
 * on the pair two rows later (dst32_r..dst109_r). */
1305 DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst21_r, filt_h0, dst32_r,
1306 filt_h0, dst43_r, filt_h0, dst0, dst1, dst2, dst3);
1307 DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst65_r, filt_h0, dst76_r,
1308 filt_h0, dst87_r, filt_h0, dst4, dst5, dst6, dst7);
1309 DUP4_ARG3(__lsx_vdp2add_w_h, dst0, dst32_r, filt_h1, dst1, dst43_r,
1310 filt_h1, dst2, dst54_r, filt_h1, dst3, dst65_r, filt_h1,
1311 dst0, dst1, dst2, dst3);
1312 DUP4_ARG3(__lsx_vdp2add_w_h, dst4, dst76_r, filt_h1, dst5, dst87_r,
1313 filt_h1, dst6, dst98_r, filt_h1, dst7, dst109_r, filt_h1,
1314 dst4, dst5, dst6, dst7);
/* Narrow 32-bit sums to 16 bits with an arithmetic right shift by 6, then
 * to unsigned 8 bits with a rounding, saturating right shift by 6. */
1316 DUP4_ARG3(__lsx_vsrani_h_w, dst1, dst0, 6, dst3, dst2, 6, dst5, dst4,
1317 6, dst7, dst6, 6, tmp0, tmp1, tmp2, tmp3);
1318 DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, 6, tmp3, tmp2, 6, out0, out1);
/* Store eight rows of 4 bytes: the four 32-bit lanes of out0, then of out1,
 * moving down four destination rows after each group. */
1320 __lsx_vstelm_w(out0, dst, 0, 0);
1321 __lsx_vstelm_w(out0, dst + dst_stride, 0, 1);
1322 __lsx_vstelm_w(out0, dst + dst_stride_2x, 0, 2);
1323 __lsx_vstelm_w(out0, dst + dst_stride_3x, 0, 3);
1324 dst += dst_stride_4x;
1325 __lsx_vstelm_w(out1, dst, 0, 0);
1326 __lsx_vstelm_w(out1, dst + dst_stride, 0, 1);
1327 __lsx_vstelm_w(out1, dst + dst_stride_2x, 0, 2);
1328 __lsx_vstelm_w(out1, dst + dst_stride_3x, 0, 3);
1329 dst += dst_stride_4x;
/* Carry the upper 64 bits of dst106 into dst22, which the next iteration
 * reads when forming dst32_r. */
1333 dst22 = __lsx_vreplvei_d(dst106, 1);
/*
 * Thin wrapper: forwards filter_x, filter_y and height, followed by the
 * constant 2.
 * NOTE(review): the callee is on a line not visible here; the constant is
 * presumably a count of 8-byte-wide strips (cf. width8mult) - confirm.
 */
1338 int32_t dst_stride,
const int8_t *filter_x,
1346 filter_x, filter_y,
height, 2);
/*
 * Thin wrapper: forwards filter_x, filter_y and height, followed by the
 * constant 3.
 * NOTE(review): the callee is on a line not visible here; the constant is
 * presumably a count of 8-byte-wide strips (cf. width8mult) - confirm.
 */
1351 int32_t dst_stride,
const int8_t *filter_x,
1355 filter_x, filter_y,
height, 3);
/*
 * Thin wrapper: forwards filter_x, filter_y and height, followed by the
 * constant 4.
 * NOTE(review): the callee is on a line not visible here; the constant is
 * presumably a count of 8-byte-wide strips (cf. width8mult) - confirm.
 */
1359 int32_t dst_stride,
const int8_t *filter_x,
1363 filter_x, filter_y,
height, 4);
/*
 * UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) generates a function named
 * ff_hevc_put_hevc_uni_<PEL>_<DIR><WIDTH>_8_lsx.  The generated function
 * selects row (FILT_DIR - 1) of the ff_hevc_<PEL>_filters table and passes
 * src, src_stride, dst, dst_stride on to common_<DIR1>_<TAP>t_<WIDTH>w_lsx.
 *
 * The instantiations that follow pass TAP = 8 for qpel and TAP = 4 for epel,
 * with mx as FILT_DIR for the horizontal ("h"/"hz") variant and my for the
 * vertical ("v"/"vt") variants.
 *
 * NOTE(review): the remaining parameters of the generated function and the
 * trailing call arguments are on lines not visible in this excerpt.
 */
1366 #define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
1367 void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst, \
1368 ptrdiff_t dst_stride, \
1369 const uint8_t *src, \
1370 ptrdiff_t src_stride, \
1376 const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
1378 common_##DIR1##_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, dst_stride, \
1382 UNI_MC(qpel,
h, 64, 8, hz, mx);
1384 UNI_MC(qpel, v, 24, 8, vt, my);
1385 UNI_MC(qpel, v, 32, 8, vt, my);
1386 UNI_MC(qpel, v, 48, 8, vt, my);
1387 UNI_MC(qpel, v, 64, 8, vt, my);
1389 UNI_MC(epel, v, 24, 4, vt, my);
1390 UNI_MC(epel, v, 32, 4, vt, my);
1394 #define UNI_MC_HV(PEL, WIDTH, TAP) \
1395 void ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_lsx(uint8_t *dst, \
1396 ptrdiff_t dst_stride, \
1397 const uint8_t *src, \
1398 ptrdiff_t src_stride, \
1404 const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
1405 const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
1407 hevc_hv_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, dst_stride, \
1408 filter_x, filter_y, height); \