/* Shuffle masks for the horizontal filters (table name assumed from the
 * matching MSA implementation). */
static const uint8_t mc_filt_mask_arr[16 * 3] = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};
/* VP8 subpel filter taps, indexed by (mx - 1) or (my - 1); rows with four
 * non-zero taps are the compacted 4-tap filters (array name assumed). */
static const int8_t subpel_filters_lsx[7][8] = {
    {-6, 123, 12, -1, 0, 0, 0, 0},
    {2, -11, 108, 36, -8, 1, 0, 0},    /* New 1/4 pel 6 tap filter */
    {-9, 93, 50, -6, 0, 0, 0, 0},
    {3, -16, 77, 77, -16, 3, 0, 0},    /* New 1/2 pel 6 tap filter */
    {-6, 50, 93, -9, 0, 0, 0, 0},
    {1, -8, 36, 108, -11, 2, 0, 0},    /* New 1/4 pel 6 tap filter */
    {-1, 12, 123, -6, 0, 0, 0, 0},
};
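/*
 * Illustrative scalar reference (an addition for this edit, not original
 * code; the helper name is hypothetical): applying one true 6-tap row,
 * e.g. {2, -11, 108, 36, -8, 1}, to the pixel at src[0] reads neighbours
 * src[-2]..src[3]; the product sum is rounded to the 7-bit filter
 * precision and clipped to the byte range.
 */
static inline uint8_t vp8_epel6_ref(const uint8_t *src, const int8_t *f)
{
    int sum = f[0] * src[-2] + f[1] * src[-1] + f[2] * src[0] +
              f[3] * src[1]  + f[4] * src[2]  + f[5] * src[3];

    sum = (sum + 64) >> 7;              /* round: tap sets sum to 128 */
    return sum < 0 ? 0 : (sum > 255 ? 255 : (uint8_t)sum);
}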
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)    \
( {                                                            \
    __m128i out0_m;                                            \
                                                               \
    out0_m = __lsx_vdp2_h_b(in0, coeff0);                      \
    out0_m = __lsx_vdp2add_h_b(out0_m, in1, coeff1);           \
    out0_m = __lsx_vdp2add_h_b(out0_m, in2, coeff2);           \
                                                               \
    out0_m;                                                    \
} )
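/*
 * Added explanatory sketch (hypothetical helper, not original code).
 * __lsx_vdp2_h_b forms each 16-bit lane from two adjacent signed bytes,
 * out[i] = a[2i] * b[2i] + a[2i+1] * b[2i+1], and __lsx_vdp2add_h_b does
 * the same while accumulating, so one DPADD_SH3_SH lane amounts to:
 */
static inline int16_t dpadd_sh3_lane_ref(const int8_t in0[2],
                                         const int8_t in1[2],
                                         const int8_t in2[2],
                                         const int8_t c0[2],
                                         const int8_t c1[2],
                                         const int8_t c2[2])
{
    int sum = in0[0] * c0[0] + in0[1] * c0[1] +
              in1[0] * c1[0] + in1[1] * c1[1] +
              in2[0] * c2[0] + in2[1] * c2[1];

    return (int16_t)sum;    /* the vector lane keeps the low 16 bits */
}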
#define VSHF_B3_SB(in0, in1, in2, in3, in4, in5, mask0, mask1, mask2, \
                   out0, out1, out2)                                  \
{                                                                     \
    DUP2_ARG3(__lsx_vshuf_b, in1, in0, mask0, in3, in2, mask1,        \
              out0, out1);                                            \
    out2 = __lsx_vshuf_b(in5, in4, mask2);                            \
}
#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2,                \
                        filt_h0, filt_h1, filt_h2)                      \
( {                                                                     \
    __m128i vec0_m, vec1_m, vec2_m;                                     \
    __m128i hz_out_m;                                                   \
                                                                        \
    VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2, \
               vec0_m, vec1_m, vec2_m);                                 \
    hz_out_m = DPADD_SH3_SH(vec0_m, vec1_m, vec2_m,                     \
                            filt_h0, filt_h1, filt_h2);                 \
                                                                        \
    hz_out_m = __lsx_vsrari_h(hz_out_m, 7);                             \
    hz_out_m = __lsx_vsat_h(hz_out_m, 7);                               \
                                                                        \
    hz_out_m;                                                           \
} )
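/*
 * Added note (hypothetical helper, not original code): per 16-bit lane
 * the two tail instructions above behave like this pair --
 * __lsx_vsrari_h is a rounding arithmetic right shift and
 * __lsx_vsat_h(x, 7) clamps to the signed 8-bit range [-128, 127] ahead
 * of the later pack back to bytes.
 */
static inline int16_t round_sat7_ref(int16_t x)
{
    int r = (x + 64) >> 7;                        /* round-to-nearest shift */
    return r < -128 ? -128 : (r > 127 ? 127 : r); /* saturate */
}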
#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                     \
                                   mask0, mask1, mask2,                        \
                                   filt0, filt1, filt2,                        \
                                   out0, out1, out2, out3)                     \
{                                                                              \
    __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
                                                                               \
    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, src2, \
              mask0, src3, src3, mask0, vec0_m, vec1_m, vec2_m, vec3_m);       \
    DUP4_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, vec2_m, filt0,     \
              vec3_m, filt0, out0, out1, out2, out3);                          \
    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, src2, \
              mask1, src3, src3, mask1, vec0_m, vec1_m, vec2_m, vec3_m);       \
    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, src2, \
              mask2, src3, src3, mask2, vec4_m, vec5_m, vec6_m, vec7_m);       \
    DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec0_m, filt1, out1, vec1_m, filt1,     \
              out2, vec2_m, filt1, out3, vec3_m, filt1, out0, out1, out2,      \
              out3);                                                           \
    DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2,     \
              out2, vec6_m, filt2, out3, vec7_m, filt2, out0, out1, out2,      \
              out3);                                                           \
}
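/*
 * Added note: the macro above is the four-row batched form of
 * HORIZ_6TAP_FILT -- one shuffle pass per mask gathers the byte pairs for
 * taps {0,1}, {2,3} and {4,5}, and the dp2/dp2add passes accumulate them,
 * leaving unrounded 16-bit sums in out0..out3.
 */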
#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)   \
( {                                                     \
    __m128i tmp0;                                       \
                                                        \
    tmp0 = __lsx_vdp2_h_b(vec0, filt0);                 \
    tmp0 = __lsx_vdp2add_h_b(tmp0, vec1, filt1);        \
                                                        \
    tmp0;                                               \
} )
#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)   \
( {                                                                   \
    __m128i vec0_m, vec1_m;                                           \
    __m128i hz_out_m;                                                 \
                                                                      \
    DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1,    \
              vec0_m, vec1_m);                                        \
    hz_out_m = FILT_4TAP_DPADD_S_H(vec0_m, vec1_m, filt_h0, filt_h1); \
                                                                      \
    hz_out_m = __lsx_vsrari_h(hz_out_m, 7);                           \
    hz_out_m = __lsx_vsat_h(hz_out_m, 7);                             \
                                                                      \
    hz_out_m;                                                         \
} )
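/*
 * Added for illustration (hypothetical helper, not original code): the
 * 4-tap path applies compacted rows such as {-6, 123, 12, -1} to
 * src[-1]..src[2], with the same round-and-clip tail as the 6-tap
 * reference above.
 */
static inline uint8_t vp8_epel4_ref(const uint8_t *src, const int8_t *f)
{
    int sum = f[0] * src[-1] + f[1] * src[0] + f[2] * src[1] + f[3] * src[2];

    sum = (sum + 64) >> 7;
    return sum < 0 ? 0 : (sum > 255 ? 255 : (uint8_t)sum);
}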
void ff_put_vp8_epel8_h6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                             const uint8_t *src, ptrdiff_t src_stride,
                             int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = subpel_filters_lsx[mx - 1];
    __m128i src0, src1, src2, src3, filt0, filt1, filt2;
    __m128i mask0, mask1, mask2;
    __m128i out0, out1, out2, out3;

    ptrdiff_t src_stride2 = src_stride << 1;
    ptrdiff_t src_stride3 = src_stride2 + src_stride;
    ptrdiff_t src_stride4 = src_stride2 << 1;

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    src -= 2;

    /* splat the three 16-bit tap pairs: {0,1}, {2,3}, {4,5} */
    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
    filt2 = __lsx_vldrepl_h(filter, 4);

    /* mask1/mask2 select the byte pairs for taps {2,3} and {4,5} */
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);

    DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
              src + src_stride3, 0, src0, src1, src2, src3);
    src += src_stride4;
    /* bias pixels to the signed range for the signed dot products */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               filt0, filt1, filt2, out0, out1, out2, out3);

    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);

    __lsx_vstelm_d(out0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(out0, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_d(out1, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(out1, dst, 0, 1);
    dst += dst_stride;

    for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
                  src + src_stride3, 0, src0, src1, src2, src3);
        src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   filt0, filt1, filt2, out0, out1, out2, out3);

        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);

        __lsx_vstelm_d(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 1);
        dst += dst_stride;
    }
}
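/*
 * Added context (an assumption, not shown in this excerpt): functions here
 * follow FFmpeg's vp8_mc_func signature, so the LoongArch vp8dsp init code
 * is expected to hook them up along the lines of
 *
 *     c->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_lsx;
 *
 * where the table indices select block size and the my/mx filter types.
 */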
void ff_put_vp8_epel16_h6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                              const uint8_t *src, ptrdiff_t src_stride,
                              int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = subpel_filters_lsx[mx - 1];
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1;
    __m128i filt2, mask0, mask1, mask2;
    __m128i out0, out1, out2, out3, out4, out5, out6, out7;

    ptrdiff_t src_stride2 = src_stride << 1;
    ptrdiff_t src_stride3 = src_stride2 + src_stride;
    ptrdiff_t src_stride4 = src_stride2 << 1;

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    src -= 2;

    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
    filt2 = __lsx_vldrepl_h(filter, 4);

    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* each row needs two 8-byte-apart loads for the 16-wide filter */
        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2,
                  0, src + src_stride3, 0, src0, src2, src4, src6);
        DUP4_ARG2(__lsx_vld, src, 8, src + src_stride, 8, src + src_stride2,
                  8, src + src_stride3, 8, src1, src3, src5, src7);
        src += src_stride4;

        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        DUP4_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src6, 128, src7, 128,
                  src4, src5, src6, src7);
        HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   filt0, filt1, filt2, out0, out1, out2, out3);
        HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
                                   filt0, filt1, filt2, out4, out5, out6, out7);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vst(out0, dst, 0);
        dst += dst_stride;
        __lsx_vst(out1, dst, 0);
        dst += dst_stride;

        DUP2_ARG3(__lsx_vssrarni_b_h, out5, out4, 7, out7, out6, 7, out4, out5);
        DUP2_ARG2(__lsx_vxori_b, out4, 128, out5, 128, out4, out5);
        __lsx_vst(out4, dst, 0);
        dst += dst_stride;
        __lsx_vst(out5, dst, 0);
        dst += dst_stride;
    }
}
void ff_put_vp8_epel8_v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                             const uint8_t *src, ptrdiff_t src_stride,
                             int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = subpel_filters_lsx[my - 1];
    __m128i src0, src1, src2, src3, src4, src7, src8, src9, src10;
    __m128i src10_l, src32_l, src76_l, src98_l, src21_l, src43_l, src87_l;
    __m128i src109_l, filt0, filt1, filt2;
    __m128i out0_l, out1_l, out2_l, out3_l;

    ptrdiff_t src_stride2 = src_stride << 1;
    ptrdiff_t src_stride3 = src_stride2 + src_stride;
    ptrdiff_t src_stride4 = src_stride2 << 1;

    src -= src_stride2;
    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
    filt2 = __lsx_vldrepl_h(filter, 4);

    DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
              src + src_stride3, 0, src0, src1, src2, src3);
    src += src_stride4;
    src4 = __lsx_vld(src, 0);
    src += src_stride;

    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    src4 = __lsx_vxori_b(src4, 128);
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src2, src1, src4,
              src3, src10_l, src32_l, src21_l, src43_l);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2,
                  0, src + src_stride3, 0, src7, src8, src9, src10);
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10,
                  128, src7, src8, src9, src10);
        src += src_stride4;

        DUP4_ARG2(__lsx_vilvl_b, src7, src4, src8, src7, src9, src8, src10,
                  src9, src76_l, src87_l, src98_l, src109_l);

        out0_l = DPADD_SH3_SH(src10_l, src32_l, src76_l, filt0, filt1, filt2);
        out1_l = DPADD_SH3_SH(src21_l, src43_l, src87_l, filt0, filt1, filt2);
        out2_l = DPADD_SH3_SH(src32_l, src76_l, src98_l, filt0, filt1, filt2);
        out3_l = DPADD_SH3_SH(src43_l, src87_l, src109_l, filt0, filt1, filt2);

        DUP2_ARG3(__lsx_vssrarni_b_h, out1_l, out0_l, 7, out3_l, out2_l, 7,
                  out0_l, out1_l);
        DUP2_ARG2(__lsx_vxori_b, out0_l, 128, out1_l, 128, out0_l, out1_l);

        __lsx_vstelm_d(out0_l, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0_l, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1_l, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1_l, dst, 0, 1);
        dst += dst_stride;

        /* carry the last interleaved pairs and row into the next iteration */
        src10_l = src76_l;
        src32_l = src98_l;
        src21_l = src87_l;
        src43_l = src109_l;
        src4 = src10;
    }
}
void ff_put_vp8_epel16_v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                              const uint8_t *src, ptrdiff_t src_stride,
                              int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = subpel_filters_lsx[my - 1];
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
    __m128i src10_l, src32_l, src54_l, src76_l, src21_l, src43_l, src65_l;
    __m128i src87_l, src10_h, src32_h, src54_h, src76_h, src21_h, src43_h;
    __m128i src65_h, src87_h;
    __m128i filt0, filt1, filt2;
    __m128i tmp0, tmp1, tmp2, tmp3;

    ptrdiff_t src_stride2 = src_stride << 1;
    ptrdiff_t src_stride3 = src_stride2 + src_stride;
    ptrdiff_t src_stride4 = src_stride2 << 1;

    src -= src_stride2;
    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
    filt2 = __lsx_vldrepl_h(filter, 4);

    DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
              src + src_stride3, 0, src0, src1, src2, src3);
    src += src_stride2;
    src4 = __lsx_vld(src + src_stride2, 0);
    src += src_stride3;

    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    src4 = __lsx_vxori_b(src4, 128);
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src4, src3, src2, src1,
              src10_l, src32_l, src43_l, src21_l);
    DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src4, src3, src2, src1,
              src10_h, src32_h, src43_h, src21_h);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2,
                  0, src + src_stride3, 0, src5, src6, src7, src8);
        src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128,
                  src5, src6, src7, src8);

        DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
                  src54_l, src65_l, src76_l, src87_l);
        DUP4_ARG2(__lsx_vilvh_b, src5, src4, src6, src5, src7, src6, src8, src7,
                  src54_h, src65_h, src76_h, src87_h);

        tmp0 = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
        tmp1 = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
        tmp2 = DPADD_SH3_SH(src10_h, src32_h, src54_h, filt0, filt1, filt2);
        tmp3 = DPADD_SH3_SH(src21_h, src43_h, src65_h, filt0, filt1, filt2);

        DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        __lsx_vst(tmp0, dst, 0);
        dst += dst_stride;
        __lsx_vst(tmp1, dst, 0);
        dst += dst_stride;

        tmp0 = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
        tmp1 = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
        tmp2 = DPADD_SH3_SH(src32_h, src54_h, src76_h, filt0, filt1, filt2);
        tmp3 = DPADD_SH3_SH(src43_h, src65_h, src87_h, filt0, filt1, filt2);

        DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        __lsx_vst(tmp0, dst, 0);
        dst += dst_stride;
        __lsx_vst(tmp1, dst, 0);
        dst += dst_stride;

        /* carry the last interleaved pairs and row into the next iteration */
        src10_l = src54_l;
        src32_l = src76_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src10_h = src54_h;
        src32_h = src76_h;
        src21_h = src65_h;
        src43_h = src87_h;
        src4 = src8;
    }
}
void ff_put_vp8_epel8_h6v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                               const uint8_t *src, ptrdiff_t src_stride,
                               int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter_horiz = subpel_filters_lsx[mx - 1];
    const int8_t *filter_vert  = subpel_filters_lsx[my - 1];
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
    __m128i filt_hz0, filt_hz1, filt_hz2;
    __m128i mask0, mask1, mask2, filt_vt0, filt_vt1, filt_vt2;
    __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    __m128i hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
    __m128i tmp0, tmp1, tmp2, tmp3;

    ptrdiff_t src_stride2 = src_stride << 1;
    ptrdiff_t src_stride3 = src_stride2 + src_stride;
    ptrdiff_t src_stride4 = src_stride2 << 1;

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    src -= (2 + src_stride2);

    DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0,
              filt_hz1);
    filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4);

    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);

    DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
              src + src_stride3, 0, src0, src1, src2, src3);
    src += src_stride4;
    src4 = __lsx_vld(src, 0);
    src += src_stride;

    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    src4 = __lsx_vxori_b(src4, 128);

    hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);

    DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0,
              filt_vt1);
    filt_vt2 = __lsx_vldrepl_h(filter_vert, 4);

    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1);
    DUP2_ARG2(__lsx_vpackev_b, hz_out2, hz_out1, hz_out4, hz_out3, out3, out4);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2,
                  0, src + src_stride3, 0, src5, src6, src7, src8);
        src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128,
                  src5, src6, src7, src8);

        hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        out2 = __lsx_vpackev_b(hz_out5, hz_out4);
        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);

        hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        out5 = __lsx_vpackev_b(hz_out6, hz_out5);
        tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);

        hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        out7 = __lsx_vpackev_b(hz_out7, hz_out6);
        tmp2 = DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);

        hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        out6 = __lsx_vpackev_b(hz_out8, hz_out7);
        tmp3 = DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);

        DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        __lsx_vstelm_d(tmp0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(tmp0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(tmp1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(tmp1, dst, 0, 1);
        dst += dst_stride;

        /* carry the last horizontal results and pairs into the next iteration */
        hz_out4 = hz_out8;
        out0 = out2;
        out1 = out7;
        out3 = out5;
        out4 = out6;
    }
}
void ff_put_vp8_epel16_h6v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                                const uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    int multiple8_cnt;

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        ff_put_vp8_epel8_h6v6_lsx(dst, dst_stride, src, src_stride, height,
                                  mx, my);
        src += 8;
        dst += 8;
    }
}
void ff_put_vp8_epel8_v4_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                             const uint8_t *src, ptrdiff_t src_stride,
                             int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = subpel_filters_lsx[my - 1];
    __m128i src0, src1, src2, src7, src8, src9, src10;
    __m128i src10_l, src72_l, src98_l, src21_l, src87_l, src109_l, filt0, filt1;
    __m128i out0, out1, out2, out3;

    ptrdiff_t src_stride2 = src_stride << 1;
    ptrdiff_t src_stride3 = src_stride2 + src_stride;
    ptrdiff_t src_stride4 = src_stride2 << 1;

    src -= src_stride;
    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);

    DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src0, src1);
    src2 = __lsx_vld(src + src_stride2, 0);
    src += src_stride3;

    DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
    src2 = __lsx_vxori_b(src2, 128);
    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_l, src21_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
                  src + src_stride3, 0, src7, src8, src9, src10);
        src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        DUP4_ARG2(__lsx_vilvl_b, src7, src2, src8, src7, src9, src8, src10, src9,
                  src72_l, src87_l, src98_l, src109_l);

        out0 = FILT_4TAP_DPADD_S_H(src10_l, src72_l, filt0, filt1);
        out1 = FILT_4TAP_DPADD_S_H(src21_l, src87_l, filt0, filt1);
        out2 = FILT_4TAP_DPADD_S_H(src72_l, src98_l, filt0, filt1);
        out3 = FILT_4TAP_DPADD_S_H(src87_l, src109_l, filt0, filt1);

        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);

        __lsx_vstelm_d(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 1);
        dst += dst_stride;

        /* carry the last interleaved pairs and row into the next iteration */
        src10_l = src98_l;
        src21_l = src109_l;
        src2 = src10;
    }
}
void ff_put_vp8_epel16_v4_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                              const uint8_t *src, ptrdiff_t src_stride,
                              int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = subpel_filters_lsx[my - 1];
    __m128i src0, src1, src2, src3, src4, src5, src6;
    __m128i src10_l, src32_l, src54_l, src21_l, src43_l, src65_l, src10_h;
    __m128i src32_h, src54_h, src21_h, src43_h, src65_h, filt0, filt1;
    __m128i tmp0, tmp1, tmp2, tmp3;

    ptrdiff_t src_stride2 = src_stride << 1;
    ptrdiff_t src_stride3 = src_stride2 + src_stride;
    ptrdiff_t src_stride4 = src_stride2 << 1;

    src -= src_stride;
    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);

    DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src0, src1);
    src2 = __lsx_vld(src + src_stride2, 0);
    src += src_stride3;

    DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
    src2 = __lsx_vxori_b(src2, 128);
    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_l, src21_l);
    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_h, src21_h);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2,
                  0, src + src_stride3, 0, src3, src4, src5, src6);
        src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128,
                  src3, src4, src5, src6);
        DUP4_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src5, src4, src6,
                  src5, src32_l, src43_l, src54_l, src65_l);
        DUP4_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src5, src4, src6,
                  src5, src32_h, src43_h, src54_h, src65_h);

        tmp0 = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
        tmp1 = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
        tmp2 = FILT_4TAP_DPADD_S_H(src10_h, src32_h, filt0, filt1);
        tmp3 = FILT_4TAP_DPADD_S_H(src21_h, src43_h, filt0, filt1);

        DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);

        __lsx_vst(tmp0, dst, 0);
        dst += dst_stride;
        __lsx_vst(tmp1, dst, 0);
        dst += dst_stride;

        tmp0 = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1);
        tmp1 = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1);
        tmp2 = FILT_4TAP_DPADD_S_H(src32_h, src54_h, filt0, filt1);
        tmp3 = FILT_4TAP_DPADD_S_H(src43_h, src65_h, filt0, filt1);

        DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);

        __lsx_vst(tmp0, dst, 0);
        dst += dst_stride;
        __lsx_vst(tmp1, dst, 0);
        dst += dst_stride;

        /* carry the last interleaved pairs and row into the next iteration */
        src10_l = src54_l;
        src21_l = src65_l;
        src10_h = src54_h;
        src21_h = src65_h;
        src2 = src6;
    }
}
void ff_put_vp8_epel8_h6v4_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                               const uint8_t *src, ptrdiff_t src_stride,
                               int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter_horiz = subpel_filters_lsx[mx - 1];
    const int8_t *filter_vert  = subpel_filters_lsx[my - 1];
    __m128i src0, src1, src2, src3, src4, src5, src6;
    __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
    __m128i filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3;
    __m128i tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3;

    ptrdiff_t src_stride2 = src_stride << 1;
    ptrdiff_t src_stride3 = src_stride2 + src_stride;
    ptrdiff_t src_stride4 = src_stride2 << 1;

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    src -= (2 + src_stride);

    DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0,
              filt_hz1);
    filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);

    DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src0, src1);
    src2 = __lsx_vld(src + src_stride2, 0);
    src += src_stride3;

    DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
    src2 = __lsx_vxori_b(src2, 128);
    hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2);

    DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0,
              filt_vt1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
                  src + src_stride3, 0, src3, src4, src5, src6);
        src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128,
                  src3, src4, src5, src6);

        hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);

        hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        vec3 = __lsx_vpackev_b(hz_out0, hz_out3);
        tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);

        hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
        tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec0, filt_vt0, filt_vt1);

        hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec1, vec2);
        tmp3 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);

        DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);

        __lsx_vstelm_d(tmp0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(tmp0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(tmp1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(tmp1, dst, 0, 1);
        dst += dst_stride;
    }
}
void ff_put_vp8_epel16_h6v4_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                                const uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    int multiple8_cnt;

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        ff_put_vp8_epel8_h6v4_lsx(dst, dst_stride, src, src_stride, height,
                                  mx, my);
        src += 8;
        dst += 8;
    }
}
void ff_put_vp8_epel8_h4v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                               const uint8_t *src, ptrdiff_t src_stride,
                               int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter_horiz = subpel_filters_lsx[mx - 1];
    const int8_t *filter_vert  = subpel_filters_lsx[my - 1];
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
    __m128i filt_hz0, filt_hz1, mask0, mask1;
    __m128i filt_vt0, filt_vt1, filt_vt2;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
    __m128i out0, out1, out2, out3, out4, out5, out6, out7;

    ptrdiff_t src_stride2 = src_stride << 1;
    ptrdiff_t src_stride3 = src_stride2 + src_stride;
    ptrdiff_t src_stride4 = src_stride2 << 1;

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    src -= (1 + src_stride2);

    DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0,
              filt_hz1);
    mask1 = __lsx_vaddi_bu(mask0, 2);

    DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
              src + src_stride3, 0, src0, src1, src2, src3);
    src += src_stride4;
    src4 = __lsx_vld(src, 0);
    src += src_stride;

    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    src4 = __lsx_vxori_b(src4, 128);

    tmp0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
    tmp1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
    tmp2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
    tmp3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
    tmp4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
    DUP4_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp2, tmp1,
              tmp4, tmp3, out0, out1, out3, out4);

    DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0,
              filt_vt1);
    filt_vt2 = __lsx_vldrepl_h(filter_vert, 4);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
                  src + src_stride3, 0, src5, src6, src7, src8);
        src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128,
                  src5, src6, src7, src8);

        tmp5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
        out2 = __lsx_vpackev_b(tmp5, tmp4);
        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);

        tmp6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
        out5 = __lsx_vpackev_b(tmp6, tmp5);
        tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);

        tmp7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
        out6 = __lsx_vpackev_b(tmp7, tmp6);
        tmp2 = DPADD_SH3_SH(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);

        tmp8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
        out7 = __lsx_vpackev_b(tmp8, tmp7);
        tmp3 = DPADD_SH3_SH(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);

        DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);

        __lsx_vstelm_d(tmp0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(tmp0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(tmp1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(tmp1, dst, 0, 1);
        dst += dst_stride;

        /* carry the last horizontal result and pairs into the next iteration */
        tmp4 = tmp8;
        out0 = out2;
        out1 = out6;
        out3 = out5;
        out4 = out7;
    }
}
void ff_put_vp8_epel16_h4v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                                const uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    int multiple8_cnt;

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        ff_put_vp8_epel8_h4v6_lsx(dst, dst_stride, src, src_stride, height,
                                  mx, my);
        src += 8;
        dst += 8;
    }
}
void ff_put_vp8_pixels8_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                            const uint8_t *src, ptrdiff_t src_stride,
                            int height, int mx, int my)
{
    int cnt;
    __m128i src0, src1, src2, src3;

    ptrdiff_t src_stride2 = src_stride << 1;
    ptrdiff_t src_stride3 = src_stride2 + src_stride;
    ptrdiff_t src_stride4 = src_stride2 << 1;

    if (0 == height % 8) {
        for (cnt = height >> 3; cnt--;) {
            DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0,
                      src + src_stride2, 0, src + src_stride3, 0,
                      src0, src1, src2, src3);
            src += src_stride4;
            __lsx_vstelm_d(src0, dst, 0, 0);
            dst += dst_stride;
            __lsx_vstelm_d(src1, dst, 0, 0);
            dst += dst_stride;
            __lsx_vstelm_d(src2, dst, 0, 0);
            dst += dst_stride;
            __lsx_vstelm_d(src3, dst, 0, 0);
            dst += dst_stride;

            DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0,
                      src + src_stride2, 0, src + src_stride3, 0,
                      src0, src1, src2, src3);
            src += src_stride4;
            __lsx_vstelm_d(src0, dst, 0, 0);
            dst += dst_stride;
            __lsx_vstelm_d(src1, dst, 0, 0);
            dst += dst_stride;
            __lsx_vstelm_d(src2, dst, 0, 0);
            dst += dst_stride;
            __lsx_vstelm_d(src3, dst, 0, 0);
            dst += dst_stride;
        }
    } else if (0 == height % 4) {
        for (cnt = (height >> 2); cnt--;) {
            DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0,
                      src + src_stride2, 0, src + src_stride3, 0,
                      src0, src1, src2, src3);
            src += src_stride4;
            __lsx_vstelm_d(src0, dst, 0, 0);
            dst += dst_stride;
            __lsx_vstelm_d(src1, dst, 0, 0);
            dst += dst_stride;
            __lsx_vstelm_d(src2, dst, 0, 0);
            dst += dst_stride;
            __lsx_vstelm_d(src3, dst, 0, 0);
            dst += dst_stride;
        }
    }
}
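/*
 * Added note (hypothetical helper, not original code): __lsx_vstelm_d
 * stores one 64-bit lane, so each pixels8 row above is equivalent to a
 * plain 8-byte copy (this sketch assumes <string.h> is available):
 */
static inline void copy8_rows_ref(uint8_t *dst, ptrdiff_t dst_stride,
                                  const uint8_t *src, ptrdiff_t src_stride,
                                  int rows)
{
    int i;

    for (i = 0; i < rows; i++)
        memcpy(dst + i * dst_stride, src + i * src_stride, 8);
}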
void ff_put_vp8_pixels16_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                             const uint8_t *src, ptrdiff_t src_stride,
                             int height, int mx, int my)
{
    int width = 16;    /* fixed block width; local because the vp8_mc_func
                        * signature carries no width (reconstruction assumption) */
    int cnt, loop_cnt;
    const uint8_t *src_tmp;
    uint8_t *dst_tmp;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7;

    ptrdiff_t src_stride2 = src_stride << 1;
    ptrdiff_t src_stride3 = src_stride2 + src_stride;
    ptrdiff_t src_stride4 = src_stride2 << 1;

    ptrdiff_t dst_stride2 = dst_stride << 1;
    ptrdiff_t dst_stride3 = dst_stride2 + dst_stride;
    ptrdiff_t dst_stride4 = dst_stride2 << 1;

    if (0 == height % 8) {
        for (cnt = (width >> 4); cnt--;) {
            src_tmp = src;
            dst_tmp = dst;
            for (loop_cnt = (height >> 3); loop_cnt--;) {
                DUP4_ARG2(__lsx_vld, src_tmp, 0, src_tmp + src_stride, 0,
                          src_tmp + src_stride2, 0, src_tmp + src_stride3, 0,
                          src4, src5, src6, src7);
                src_tmp += src_stride4;

                __lsx_vst(src4, dst_tmp, 0);
                __lsx_vst(src5, dst_tmp + dst_stride, 0);
                __lsx_vst(src6, dst_tmp + dst_stride2, 0);
                __lsx_vst(src7, dst_tmp + dst_stride3, 0);
                dst_tmp += dst_stride4;

                DUP4_ARG2(__lsx_vld, src_tmp, 0, src_tmp + src_stride, 0,
                          src_tmp + src_stride2, 0, src_tmp + src_stride3, 0,
                          src4, src5, src6, src7);
                src_tmp += src_stride4;

                __lsx_vst(src4, dst_tmp, 0);
                __lsx_vst(src5, dst_tmp + dst_stride, 0);
                __lsx_vst(src6, dst_tmp + dst_stride2, 0);
                __lsx_vst(src7, dst_tmp + dst_stride3, 0);
                dst_tmp += dst_stride4;
            }
            src += 16;
            dst += 16;
        }
    } else if (0 == height % 4) {
        for (cnt = (height >> 2); cnt--;) {
            DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0,
                      src + src_stride2, 0, src + src_stride3, 0,
                      src0, src1, src2, src3);
            src += src_stride4;    /* advance the four rows just loaded */

            __lsx_vst(src0, dst, 0);
            __lsx_vst(src1, dst + dst_stride, 0);
            __lsx_vst(src2, dst + dst_stride2, 0);
            __lsx_vst(src3, dst + dst_stride3, 0);
            dst += dst_stride4;
        }
    }
}