27 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
29 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
31 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
34 #define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
35 mask0, mask1, mask2, mask3, \
36 filt0, filt1, filt2, filt3, \
39 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
41 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
42 DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \
43 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
44 DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \
45 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \
46 DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1); \
47 VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \
48 DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, out0, out1); \
51 #define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
52 mask0, mask1, mask2, mask3, \
53 filt0, filt1, filt2, filt3, \
54 out0, out1, out2, out3) \
56 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
58 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
59 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
60 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
61 out0, out1, out2, out3); \
62 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \
63 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \
64 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
65 out0, out1, out2, out3); \
66 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \
67 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \
68 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
69 out0, out1, out2, out3); \
70 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \
71 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \
72 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
73 out0, out1, out2, out3); \
76 #define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
77 mask0, mask1, filt0, filt1, \
80 v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
82 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
83 DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \
84 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
85 DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \
88 #define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
89 mask0, mask1, filt0, filt1, \
90 out0, out1, out2, out3) \
92 v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
94 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
95 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
96 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
97 out0, out1, out2, out3); \
98 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \
99 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \
100 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \
101 out0, out1, out2, out3); \
109 uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
112 LD2(
src, src_stride, out0, out1);
117 LD4(
src, src_stride, out0, out1, out2, out3);
118 src += (4 * src_stride);
119 SD4(out0, out1, out2, out3,
dst, dst_stride);
120 dst += (4 * dst_stride);
121 LD2(
src, src_stride, out0, out1);
125 }
else if (0 == (
height % 8)) {
126 for (cnt = (
height >> 3); cnt--;) {
127 LD4(
src, src_stride, out0, out1, out2, out3);
128 src += (4 * src_stride);
129 LD4(
src, src_stride, out4, out5, out6, out7);
130 src += (4 * src_stride);
131 SD4(out0, out1, out2, out3,
dst, dst_stride);
132 dst += (4 * dst_stride);
133 SD4(out4, out5, out6, out7,
dst, dst_stride);
134 dst += (4 * dst_stride);
136 }
else if (0 == (
height % 4)) {
137 for (cnt = (
height >> 2); cnt--;) {
138 LD4(
src, src_stride, out0, out1, out2, out3);
139 src += (4 * src_stride);
140 SD4(out0, out1, out2, out3,
dst, dst_stride);
141 dst += (4 * dst_stride);
153 src += (8 * src_stride);
155 dst += (8 * dst_stride);
169 src += (8 * src_stride);
171 dst += (8 * dst_stride);
173 src += (4 * src_stride);
175 dst += (4 * dst_stride);
176 }
else if (0 == (
height % 8)) {
177 for (cnt = (
height >> 3); cnt--;) {
180 src += (8 * src_stride);
183 dst += (8 * dst_stride);
185 }
else if (0 == (
height % 4)) {
186 for (cnt = (
height >> 2); cnt--;) {
188 src += (4 * src_stride);
191 dst += (4 * dst_stride);
202 uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
204 for (cnt = 4; cnt--;) {
206 LD4(
src + 16, src_stride, out0, out1, out2, out3);
207 src += (4 * src_stride);
208 LD4(
src + 16, src_stride, out4, out5, out6, out7);
209 src += (4 * src_stride);
212 SD4(out0, out1, out2, out3,
dst + 16, dst_stride);
213 dst += (4 * dst_stride);
214 SD4(out4, out5, out6, out7,
dst + 16, dst_stride);
215 dst += (4 * dst_stride);
226 for (cnt = (
height >> 2); cnt--;) {
228 LD_UB4(
src + 16, src_stride, src4, src5, src6, src7);
229 src += (4 * src_stride);
231 ST_UB4(src4, src5, src6, src7,
dst + 16, dst_stride);
232 dst += (4 * dst_stride);
241 v16u8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9, src10;
244 for (cnt = (
height >> 2); cnt--;) {
246 LD_UB4(
src + 16, src_stride, src4, src5, src6, src7);
247 LD_UB4(
src + 32, src_stride, src8, src9, src10, src11);
248 src += (4 * src_stride);
251 ST_UB4(src4, src5, src6, src7,
dst + 16, dst_stride);
252 ST_UB4(src8, src9, src10, src11,
dst + 32, dst_stride);
253 dst += (4 * dst_stride);
263 v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
265 for (cnt = (
height >> 2); cnt--;) {
270 LD_UB4(
src, 16, src8, src9, src10, src11);
272 LD_UB4(
src, 16, src12, src13, src14, src15);
279 ST_UB4(src8, src9, src10, src11,
dst, 16);
281 ST_UB4(src12, src13, src14, src15,
dst, 16);
290 v16u8 mask0, mask1, mask2, mask3,
out;
291 v16i8
src0,
src1,
src2, src3, filt0, filt1, filt2, filt3;
292 v8i16
filt, out0, out1;
308 mask3, filt0, filt1, filt2, filt3, out0, out1);
319 v16i8 filt0, filt1, filt2, filt3;
321 v16u8 mask0, mask1, mask2, mask3,
out;
322 v8i16
filt, out0, out1, out2, out3;
337 src += (4 * src_stride);
339 mask3, filt0, filt1, filt2, filt3, out0, out1);
343 mask3, filt0, filt1, filt2, filt3, out2, out3);
349 ST_W4(
out, 0, 1, 2, 3,
dst + 4 * dst_stride, dst_stride);
356 v16u8 mask0, mask1, mask2, mask3,
out;
357 v16i8
src0,
src1,
src2, src3, filt0, filt1, filt2, filt3;
358 v8i16
filt, out0, out1, out2, out3;
373 src += (4 * src_stride);
375 mask3, filt0, filt1, filt2, filt3, out0, out1);
378 src += (4 * src_stride);
380 mask3, filt0, filt1, filt2, filt3, out2, out3);
386 ST_W4(
out, 0, 1, 2, 3,
dst + 4 * dst_stride, dst_stride);
387 dst += (8 * dst_stride);
391 src += (4 * src_stride);
393 mask3, filt0, filt1, filt2, filt3, out0, out1);
396 src += (4 * src_stride);
398 mask3, filt0, filt1, filt2, filt3, out2, out3);
405 ST_W4(
out, 0, 1, 2, 3,
dst + 4 * dst_stride, dst_stride);
416 }
else if (16 ==
height) {
426 v16i8
src0,
src1,
src2, src3, filt0, filt1, filt2, filt3;
427 v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
428 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;
429 v8i16
filt, out0, out1, out2, out3;
442 for (loop_cnt = (
height >> 2); loop_cnt--;) {
445 src += (4 * src_stride);
449 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
450 out0, out1, out2, out3);
453 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,
454 out0, out1, out2, out3);
457 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,
458 out0, out1, out2, out3);
461 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,
462 out0, out1, out2, out3);
468 ST_D4(tmp0, tmp1, 0, 1, 0, 1,
dst, dst_stride);
469 dst += (4 * dst_stride);
478 v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00;
479 v16u8 tmp0, tmp1, tmp2;
481 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
482 v16i8 filt0, filt1, filt2, filt3;
483 v8i16
filt, out0, out1, out2, out3, out4, out5;
501 for (loop_cnt = 4; loop_cnt--;) {
505 LD_SB4(
src + 8, src_stride, src4, src5, src6, src7);
509 src += (4 * src_stride);
513 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0,
517 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, out0,
521 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, out0,
525 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, out0,
529 VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec0, vec1);
531 VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec2, vec3);
533 VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5);
535 VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec6, vec7);
546 ST_D4(tmp0, tmp1, 0, 1, 0, 1,
dst, dst_stride);
547 ST_W4(tmp2, 0, 1, 2, 3,
dst + 8, dst_stride);
548 dst += (4 * dst_stride);
557 v16u8 mask0, mask1, mask2, mask3,
out;
559 v16i8 filt0, filt1, filt2, filt3;
560 v8i16
filt, out0, out1, out2, out3;
573 for (loop_cnt = (
height >> 2); loop_cnt--;) {
576 src += (2 * src_stride);
580 src += (2 * src_stride);
585 mask3, filt0, filt1, filt2, filt3, out0,
597 mask3, filt0, filt1, filt2, filt3, out0,
615 v16i8
src0,
src1,
src2, src3, filt0, filt1, filt2, filt3;
616 v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7,
out;
617 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
619 v8i16 out0, out1, out2, out3, out8, out9,
filt;
636 for (loop_cnt = 16; loop_cnt--;) {
640 src += (2 * src_stride);
644 DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
650 DPADD_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2,
651 out0, out8, out2, out9);
656 DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
657 out0, out8, out2, out9);
662 DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
663 out0, out8, out2, out9);
685 v16u8 mask0, mask1, mask2, mask3,
out;
687 v16i8 filt0, filt1, filt2, filt3;
688 v8i16
filt, out0, out1, out2, out3;
701 for (loop_cnt = (
height >> 1); loop_cnt--;) {
717 mask3, filt0, filt1, filt2, filt3, out0,
729 mask3, filt0, filt1, filt2, filt3, out0,
746 v16i8
src0,
src1,
src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2;
748 v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7,
out;
749 v8i16
filt, out0, out1, out2, out3;
766 for (loop_cnt = 64; loop_cnt--;) {
775 src4 = (v16i8) __msa_xori_b((v16u8) src4, 128);
779 DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
783 out2 = __msa_dpadd_s_h(out2, vec2, filt1);
787 out2 = __msa_dpadd_s_h(out2, vec2, filt2);
792 out2 = __msa_dpadd_s_h(out2, vec2, filt3);
795 out3 = __msa_srari_h(out2, 6);
800 VSHF_B3_SB(
src2, src3, src3, src3, src4, src4, mask4, mask0, mask0,
802 DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
803 VSHF_B3_SB(
src2, src3, src3, src3, src4, src4, mask5, mask1, mask1,
806 out2 = __msa_dpadd_s_h(out2, vec2, filt1);
807 VSHF_B3_SB(
src2, src3, src3, src3, src4, src4, mask6, mask2, mask2,
810 out2 = __msa_dpadd_s_h(out2, vec2, filt2);
811 VSHF_B3_SB(
src2, src3, src3, src3, src4, src4, mask7, mask3, mask3,
814 out2 = __msa_dpadd_s_h(out2, vec2, filt3);
817 out2 = __msa_srari_h(out2, 6);
832 v16u8 mask0, mask1, mask2, mask3,
out;
834 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
835 v16i8 filt0, filt1, filt2, filt3;
836 v8i16 res0, res1, res2, res3,
filt;
849 for (loop_cnt =
height; loop_cnt--;) {
857 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
861 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
865 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
869 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
879 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
880 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
881 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
883 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec0, vec1);
884 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec2, vec3);
885 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
887 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
888 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
889 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
891 VSHF_B2_SB(src4, src4, src5, src5, mask3, mask3, vec4, vec5);
892 VSHF_B2_SB(src6, src6, src7, src7, mask3, mask3, vec6, vec7);
893 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
911 uint32_t res = (
height & 0x07) >> 1;
913 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9, src10;
914 v16i8 src11, src12, src13, src14;
915 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
916 v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
917 v16i8 src1110_r, src1211_r, src1312_r, src1413_r, src12111110, src14131312;
918 v16i8 src10998, filt0, filt1, filt2, filt3;
919 v8i16
filt, out10, out32, out54, out76;
921 src -= (3 * src_stride);
927 src += (7 * src_stride);
929 ILVR_B4_SB(
src1,
src0, src3,
src2, src5, src4,
src2,
src1, src10_r, src32_r,
931 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
932 ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
936 for (loop_cnt = (
height >> 3); loop_cnt--;) {
937 LD_SB4(
src, src_stride, src7, src8, src9, src10);
938 src += (4 * src_stride);
939 LD_SB4(
src, src_stride, src11, src12, src13, src14);
940 src += (4 * src_stride);
942 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
943 src87_r, src98_r, src109_r);
944 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
945 src1110_r, src1211_r, src1312_r, src1413_r);
946 ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
947 ILVR_D2_SB(src1211_r, src1110_r, src1413_r, src1312_r,
948 src12111110, src14131312);
952 DOTP_SB2_SH(src2110, src4332, filt0, filt0, out10, out32);
953 DOTP_SB2_SH(src6554, src8776, filt0, filt0, out54, out76);
954 DPADD_SB2_SH(src4332, src6554, filt1, filt1, out10, out32);
955 DPADD_SB2_SH(src8776, src10998, filt1, filt1, out54, out76);
956 DPADD_SB2_SH(src6554, src8776, filt2, filt2, out10, out32);
957 DPADD_SB2_SH(src10998, src12111110, filt2, filt2, out54, out76);
958 DPADD_SB2_SH(src8776, src10998, filt3, filt3, out10, out32);
959 DPADD_SB2_SH(src12111110, src14131312, filt3, filt3, out54, out76);
966 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3,
dst, dst_stride);
967 dst += (8 * dst_stride);
970 src4332 = src12111110;
971 src6554 = src14131312;
976 src += 2 * src_stride;
977 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
978 src8776 = (v16i8)__msa_ilvr_d((v2i64) src87_r, (v2i64) src76_r);
979 src8776 = (v16i8)__msa_xori_b(src8776, 128);
980 out10 = (v8i16)__msa_dotp_s_h((v16i8) src2110, (v16i8) filt0);
981 out10 = (v8i16)__msa_dpadd_s_h((v8i16) out10, src4332, filt1);
982 out10 = (v8i16)__msa_dpadd_s_h((v8i16) out10, src6554, filt2);
983 out10 = (v8i16)__msa_dpadd_s_h((v8i16) out10, src8776, filt3);
984 out10 = (v8i16)__msa_srari_h((v8i16) out10, 6);
985 out10 = (v8i16)__msa_sat_s_h((v8i16) out10, 7);
986 out0 = (v16u8)__msa_pckev_b((v16i8) out10, (v16i8) out10);
987 out0 = (v16u8)__msa_xori_b((v16u8) out0, 128);
989 dst += 2 * dst_stride;
1002 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9, src10;
1003 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1004 v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
1006 v8i16
filt, out0_r, out1_r, out2_r, out3_r;
1008 src -= (3 * src_stride);
1015 src += (7 * src_stride);
1016 ILVR_B4_SB(
src1,
src0, src3,
src2, src5, src4,
src2,
src1, src10_r, src32_r,
1018 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1020 for (loop_cnt = (
height >> 2); loop_cnt--;) {
1021 LD_SB4(
src, src_stride, src7, src8, src9, src10);
1023 src += (4 * src_stride);
1025 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1026 src87_r, src98_r, src109_r);
1027 DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
1028 filt0, out0_r, out1_r, out2_r, out3_r);
1029 DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
1030 filt1, out0_r, out1_r, out2_r, out3_r);
1031 DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
1032 filt2, out0_r, out1_r, out2_r, out3_r);
1033 DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
1034 filt3, out0_r, out1_r, out2_r, out3_r);
1036 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1039 ST_D4(tmp0, tmp1, 0, 1, 0, 1,
dst, dst_stride);
1040 dst += (4 * dst_stride);
1057 uint32_t out2, out3;
1058 uint64_t out0, out1;
1059 v16u8 tmp0, tmp1, tmp2, tmp3;
1060 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9, src10;
1061 v16i8 filt0, filt1, filt2, filt3;
1062 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1063 v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1064 v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1065 v8i16
filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1067 src -= (3 * src_stride);
1073 src += (7 * src_stride);
1077 ILVR_B4_SB(
src1,
src0, src3,
src2, src5, src4,
src2,
src1, src10_r, src32_r,
1079 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1080 ILVL_B4_SB(
src1,
src0, src3,
src2, src5, src4,
src2,
src1, src10_l, src32_l,
1082 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1084 for (loop_cnt = 4; loop_cnt--;) {
1085 LD_SB4(
src, src_stride, src7, src8, src9, src10);
1087 src += (4 * src_stride);
1089 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1090 src87_r, src98_r, src109_r);
1091 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1092 src87_l, src98_l, src109_l);
1094 filt1, filt2, filt3);
1096 filt1, filt2, filt3);
1098 filt1, filt2, filt3);
1100 filt1, filt2, filt3);
1102 filt1, filt2, filt3);
1104 filt1, filt2, filt3);
1106 filt1, filt2, filt3);
1108 filt1, filt2, filt3);
1111 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1112 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1113 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1114 out3_r, tmp0, tmp1, tmp2, tmp3);
1117 out0 = __msa_copy_u_d((v2i64) tmp0, 0);
1118 out1 = __msa_copy_u_d((v2i64) tmp1, 0);
1119 out2 = __msa_copy_u_w((v4i32) tmp0, 2);
1120 out3 = __msa_copy_u_w((v4i32) tmp1, 2);
1122 SW(out2, (
dst + 8));
1125 SW(out3, (
dst + 8));
1127 out0 = __msa_copy_u_d((v2i64) tmp2, 0);
1128 out1 = __msa_copy_u_d((v2i64) tmp3, 0);
1129 out2 = __msa_copy_u_w((v4i32) tmp2, 2);
1130 out3 = __msa_copy_u_w((v4i32) tmp3, 2);
1132 SW(out2, (
dst + 8));
1135 SW(out3, (
dst + 8));
1159 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9, src10;
1160 v16i8 filt0, filt1, filt2, filt3;
1161 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1162 v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1163 v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1164 v16u8 tmp0, tmp1, tmp2, tmp3;
1165 v8i16
filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1167 src -= (3 * src_stride);
1174 src += (7 * src_stride);
1175 ILVR_B4_SB(
src1,
src0, src3,
src2, src5, src4,
src2,
src1, src10_r, src32_r,
1177 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1178 ILVL_B4_SB(
src1,
src0, src3,
src2, src5, src4,
src2,
src1, src10_l, src32_l,
1180 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1182 for (loop_cnt = (
height >> 2); loop_cnt--;) {
1183 LD_SB4(
src, src_stride, src7, src8, src9, src10);
1185 src += (4 * src_stride);
1187 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1188 src87_r, src98_r, src109_r);
1189 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1190 src87_l, src98_l, src109_l);
1192 filt1, filt2, filt3);
1194 filt1, filt2, filt3);
1196 filt1, filt2, filt3);
1198 filt1, filt2, filt3);
1200 filt1, filt2, filt3);
1202 filt1, filt2, filt3);
1204 filt1, filt2, filt3);
1206 filt1, filt2, filt3);
1209 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1210 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1211 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1212 out3_r, tmp0, tmp1, tmp2, tmp3);
1214 ST_UB4(tmp0, tmp1, tmp2, tmp3,
dst, dst_stride);
1215 dst += (4 * dst_stride);
1238 const uint8_t *src_tmp;
1240 uint32_t loop_cnt, cnt;
1241 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9, src10;
1242 v16i8 filt0, filt1, filt2, filt3;
1243 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1244 v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1245 v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1246 v16u8 tmp0, tmp1, tmp2, tmp3;
1247 v8i16
filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1249 src -= (3 * src_stride);
1254 for (cnt = (
width >> 4); cnt--;) {
1260 src_tmp += (7 * src_stride);
1262 src32_r, src54_r, src21_r);
1263 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1265 src32_l, src54_l, src21_l);
1266 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1268 for (loop_cnt = (
height >> 2); loop_cnt--;) {
1269 LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
1271 src_tmp += (4 * src_stride);
1272 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1273 src87_r, src98_r, src109_r);
1274 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1275 src87_l, src98_l, src109_l);
1277 filt0, filt1, filt2, filt3);
1279 filt0, filt1, filt2, filt3);
1281 filt0, filt1, filt2, filt3);
1283 filt0, filt1, filt2, filt3);
1285 filt0, filt1, filt2, filt3);
1287 filt0, filt1, filt2, filt3);
1289 filt0, filt1, filt2, filt3);
1291 filt0, filt1, filt2, filt3);
1294 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1295 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1296 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1297 out3_r, tmp0, tmp1, tmp2, tmp3);
1299 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
1300 dst_tmp += (4 * dst_stride);
1361 const int8_t *filter_x,
1362 const int8_t *filter_y,
1366 uint32_t res =
height & 0x07;
1368 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
1369 v16i8 src9, src10, src11, src12, src13, src14;
1370 v8i16 filt0, filt1, filt2, filt3;
1371 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1372 v16i8 mask1, mask2, mask3;
1374 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1375 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1376 v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
1377 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst1110_r, dst1312_r;
1378 v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r, dst1211_r, dst1413_r;
1379 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
1382 src -= ((3 * src_stride) + 3);
1383 filter_vec =
LD_SH(filter_x);
1384 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1386 filter_vec =
LD_SH(filter_y);
1389 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1396 src += (7 * src_stride);
1399 VSHF_B4_SB(
src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1400 VSHF_B4_SB(
src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1402 vec8, vec9, vec10, vec11);
1403 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1404 vec12, vec13, vec14, vec15);
1419 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1421 for (loop_cnt =
height >> 3; loop_cnt--;) {
1422 LD_SB8(
src, src_stride, src7, src8, src9, src10, src11, src12, src13,
1424 src += (8 * src_stride);
1425 XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
1427 VSHF_B4_SB(src7, src11, mask0, mask1, mask2, mask3,
1428 vec0, vec1, vec2, vec3);
1429 VSHF_B4_SB(src8, src12, mask0, mask1, mask2, mask3,
1430 vec4, vec5, vec6, vec7);
1431 VSHF_B4_SB(src9, src13, mask0, mask1, mask2, mask3,
1432 vec8, vec9, vec10, vec11);
1433 VSHF_B4_SB(src10, src14, mask0, mask1, mask2, mask3,
1434 vec12, vec13, vec14, vec15);
1445 dst76_r = __msa_ilvr_h(dst117, dst66);
1448 ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
1449 dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
1450 dst1110_r = __msa_ilvr_h(dst117, dst1410);
1452 dst0_r =
HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1453 filt_h1, filt_h2, filt_h3);
1454 dst1_r =
HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1455 filt_h1, filt_h2, filt_h3);
1456 dst2_r =
HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1457 filt_h1, filt_h2, filt_h3);
1458 dst3_r =
HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1459 filt_h1, filt_h2, filt_h3);
1460 dst4_r =
HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
1461 filt_h1, filt_h2, filt_h3);
1462 dst5_r =
HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
1463 filt_h1, filt_h2, filt_h3);
1464 dst6_r =
HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
1465 filt_h1, filt_h2, filt_h3);
1467 filt_h0, filt_h1, filt_h2, filt_h3);
1469 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1470 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1473 SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
1474 SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
1475 PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1476 PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
1479 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3,
dst, dst_stride);
1480 dst += (8 * dst_stride);
1483 dst32_r = dst1110_r;
1484 dst54_r = dst1312_r;
1486 dst43_r = dst1211_r;
1487 dst65_r = dst1413_r;
1488 dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
1491 LD_SB8(
src, src_stride, src7, src8, src9, src10, src11, src12, src13,
1493 XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
1495 VSHF_B4_SB(src7, src11, mask0, mask1, mask2, mask3,
1496 vec0, vec1, vec2, vec3);
1497 VSHF_B4_SB(src8, src12, mask0, mask1, mask2, mask3,
1498 vec4, vec5, vec6, vec7);
1499 VSHF_B4_SB(src9, src13, mask0, mask1, mask2, mask3,
1500 vec8, vec9, vec10, vec11);
1501 VSHF_B4_SB(src10, src14, mask0, mask1, mask2, mask3,
1502 vec12, vec13, vec14, vec15);
1513 dst76_r = __msa_ilvr_h(dst117, dst66);
1516 ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
1517 dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
1518 dst1110_r = __msa_ilvr_h(dst117, dst1410);
1520 dst0_r =
HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1521 filt_h1, filt_h2, filt_h3);
1522 dst1_r =
HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1523 filt_h1, filt_h2, filt_h3);
1524 dst2_r =
HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1525 filt_h1, filt_h2, filt_h3);
1526 dst3_r =
HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1527 filt_h1, filt_h2, filt_h3);
1528 dst4_r =
HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
1529 filt_h1, filt_h2, filt_h3);
1530 dst5_r =
HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
1531 filt_h1, filt_h2, filt_h3);
1532 dst6_r =
HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
1533 filt_h1, filt_h2, filt_h3);
1535 filt_h0, filt_h1, filt_h2, filt_h3);
1537 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1538 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1541 SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
1542 SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
1543 PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1544 PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
1548 ST_W2(out0, 0, 1,
dst, dst_stride);
1549 }
else if(res == 4) {
1550 ST_W4(out0, 0, 1, 2, 3,
dst, dst_stride);
1552 ST_W4(out0, 0, 1, 2, 3,
dst, dst_stride);
1553 ST_W2(out1, 0, 1,
dst + 4 * dst_stride, dst_stride);
1562 const int8_t *filter_x,
1563 const int8_t *filter_y,
1566 uint32_t loop_cnt, cnt;
1567 const uint8_t *src_tmp;
1570 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
1571 v8i16 filt0, filt1, filt2, filt3;
1572 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1573 v16i8 mask1, mask2, mask3;
1575 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1576 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1577 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1578 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
1579 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1580 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1581 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1582 v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
1585 src -= ((3 * src_stride) + 3);
1587 filter_vec =
LD_SH(filter_x);
1588 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1590 filter_vec =
LD_SH(filter_y);
1593 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1599 for (cnt =
width >> 3; cnt--;) {
1604 src_tmp += (7 * src_stride);
1609 vec0, vec1, vec2, vec3);
1611 vec4, vec5, vec6, vec7);
1613 vec8, vec9, vec10, vec11);
1614 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1615 vec12, vec13, vec14, vec15);
1625 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1626 vec0, vec1, vec2, vec3);
1627 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1628 vec4, vec5, vec6, vec7);
1629 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1630 vec8, vec9, vec10, vec11);
1638 for (loop_cnt =
height >> 1; loop_cnt--;) {
1639 LD_SB2(src_tmp, src_stride, src7, src8);
1641 src_tmp += 2 * src_stride;
1643 ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1644 dst10_r, dst32_r, dst54_r, dst21_r);
1645 ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1646 dst10_l, dst32_l, dst54_l, dst21_l);
1647 ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1648 ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1650 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1651 vec0, vec1, vec2, vec3);
1657 filt_h0, filt_h1, filt_h2, filt_h3);
1659 filt_h0, filt_h1, filt_h2, filt_h3);
1663 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
1664 vec0, vec1, vec2, vec3);
1670 filt_h0, filt_h1, filt_h2, filt_h3);
1672 filt_h0, filt_h1, filt_h2, filt_h3);
1676 SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
1678 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
1680 ST_D2(
out, 0, 1, dst_tmp, dst_stride);
1681 dst_tmp += (2 * dst_stride);
1701 const int8_t *filter_x,
1702 const int8_t *filter_y,
1706 filter_x, filter_y,
height, 8);
1713 const int8_t *filter_x,
1714 const int8_t *filter_y,
1718 const uint8_t *src_tmp;
1721 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9, src10;
1722 v16i8 src11, src12, src13, src14;
1723 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1724 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1725 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1726 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1727 v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
1728 v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
1729 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
1730 v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
1731 v8i16 dst87_r, dst98_r, dst1110_r, dst1312_r, dst109_r, dst1211_r;
1732 v8i16 dst1413_r, dst87_l, filter_vec;
1733 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
1734 v4i32 dst0_l, dst1_l;
1736 src -= ((3 * src_stride) + 3);
1738 filter_vec =
LD_SH(filter_x);
1739 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1741 filter_vec =
LD_SH(filter_y);
1744 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1755 src_tmp += (7 * src_stride);
1763 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
1774 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1775 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1776 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1785 for (loop_cnt = 8; loop_cnt--;) {
1786 LD_SB2(src_tmp, src_stride, src7, src8);
1788 src_tmp += 2 * src_stride;
1790 ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
1791 dst32_r, dst54_r, dst21_r);
1792 ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
1793 dst32_l, dst54_l, dst21_l);
1794 ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1795 ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1797 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1804 filt_h0, filt_h1, filt_h2, filt_h3);
1806 filt_h0, filt_h1, filt_h2, filt_h3);
1810 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1817 filt_h0, filt_h1, filt_h2, filt_h3);
1819 filt_h0, filt_h1, filt_h2, filt_h3);
1823 SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
1825 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
1827 ST_D2(out0, 0, 1, dst_tmp, dst_stride);
1828 dst_tmp += (2 * dst_stride);
1848 src += (7 * src_stride);
1851 VSHF_B4_SB(
src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
1852 VSHF_B4_SB(
src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
1853 VSHF_B4_SB(
src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1855 VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
1871 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1873 for (loop_cnt = 2; loop_cnt--;) {
1874 LD_SB8(
src, src_stride, src7, src8, src9, src10, src11, src12, src13,
1876 src += (8 * src_stride);
1877 XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
1879 VSHF_B4_SB(src7, src11, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
1881 VSHF_B4_SB(src8, src12, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
1883 VSHF_B4_SB(src9, src13, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1885 VSHF_B4_SB(src10, src14, mask4, mask5, mask6, mask7, vec12, vec13,
1897 dst76_r = __msa_ilvr_h(dst117, dst66);
1900 ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
1901 dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
1902 dst1110_r = __msa_ilvr_h(dst117, dst1410);
1904 dst0_r =
HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1905 filt_h1, filt_h2, filt_h3);
1906 dst1_r =
HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1907 filt_h1, filt_h2, filt_h3);
1908 dst2_r =
HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1909 filt_h1, filt_h2, filt_h3);
1910 dst3_r =
HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1911 filt_h1, filt_h2, filt_h3);
1912 dst4_r =
HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
1913 filt_h1, filt_h2, filt_h3);
1914 dst5_r =
HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
1915 filt_h1, filt_h2, filt_h3);
1916 dst6_r =
HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
1917 filt_h1, filt_h2, filt_h3);
1919 filt_h0, filt_h1, filt_h2, filt_h3);
1921 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1922 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1925 SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
1926 SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
1927 PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1928 PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
1931 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3,
dst, dst_stride);
1932 dst += (8 * dst_stride);
1935 dst32_r = dst1110_r;
1936 dst54_r = dst1312_r;
1938 dst43_r = dst1211_r;
1939 dst65_r = dst1413_r;
1940 dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
1948 const int8_t *filter_x,
1949 const int8_t *filter_y,
1953 filter_x, filter_y,
height, 16);
1960 const int8_t *filter_x,
1961 const int8_t *filter_y,
1965 filter_x, filter_y,
height, 24);
1972 const int8_t *filter_x,
1973 const int8_t *filter_y,
1977 filter_x, filter_y,
height, 32);
1984 const int8_t *filter_x,
1985 const int8_t *filter_y,
1989 filter_x, filter_y,
height, 48);
1996 const int8_t *filter_x,
1997 const int8_t *filter_y,
2001 filter_x, filter_y,
height, 64);
2008 v16i8 filt0, filt1,
src0,
src1, mask0, mask1, vec0, vec1;
2025 res0 = __msa_srari_h(res0, 6);
2026 res0 = __msa_sat_s_h(res0, 7);
2035 v16i8
src0,
src1,
src2, src3, filt0, filt1, mask0, mask1;
2036 v8i16
filt, out0, out1;
2051 filt0, filt1, out0, out1);
2062 v16i8
src0,
src1,
src2, src3, filt0, filt1, mask0, mask1;
2064 v8i16
filt, out0, out1, out2, out3;
2076 src += (4 * src_stride);
2080 filt0, filt1, out0, out1);
2084 filt0, filt1, out2, out3);
2090 ST_W4(
out, 0, 1, 2, 3,
dst + 4 * dst_stride, dst_stride);
2098 v16i8 filt0, filt1, mask0, mask1;
2100 v8i16
filt, out0, out1, out2, out3;
2112 src += (8 * src_stride);
2115 filt0, filt1, out0, out1);
2117 filt0, filt1, out2, out3);
2123 ST_W4(
out, 0, 1, 2, 3,
dst + 4 * dst_stride, dst_stride);
2124 dst += (8 * dst_stride);
2127 src += (8 * src_stride);
2130 filt0, filt1, out0, out1);
2132 filt0, filt1, out2, out3);
2138 ST_W4(
out, 0, 1, 2, 3,
dst + 4 * dst_stride, dst_stride);
2147 }
else if (4 ==
height) {
2149 }
else if (8 ==
height) {
2151 }
else if (16 ==
height) {
2160 v16i8
src0,
src1,
src2, src3, filt0, filt1, mask0, mask1;
2162 v8i16
filt, out0, out1, out2, out3;
2174 src += (4 * src_stride);
2178 filt1, out0, out1, out2, out3);
2183 ST_W2(out4, 0, 2,
dst, dst_stride);
2184 ST_H2(out4, 2, 6,
dst + 4, dst_stride);
2185 ST_W2(out5, 0, 2,
dst + 2 * dst_stride, dst_stride);
2186 ST_H2(out5, 2, 6,
dst + 2 * dst_stride + 4, dst_stride);
2187 dst += (4 * dst_stride);
2190 src += (4 * src_stride);
2194 filt1, out0, out1, out2, out3);
2199 ST_W2(out4, 0, 2,
dst, dst_stride);
2200 ST_H2(out4, 2, 6,
dst + 4, dst_stride);
2201 ST_W2(out5, 0, 2,
dst + 2 * dst_stride, dst_stride);
2202 ST_H2(out5, 2, 6,
dst + 2 * dst_stride + 4, dst_stride);
2210 v16i8
src0,
src1, filt0, filt1, mask0, mask1;
2212 v8i16
filt, vec0, vec1, vec2, vec3;
2222 for (loop_cnt = (
height >> 1); loop_cnt--;) {
2224 src += (2 * src_stride);
2228 DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1);
2235 dst += (2 * dst_stride);
2244 v16i8
src0,
src1,
src2, src3, filt0, filt1, mask0, mask1;
2246 v8i16
filt, out0, out1, out2, out3;
2257 for (loop_cnt = (
height >> 2); loop_cnt--;) {
2259 src += (4 * src_stride);
2263 filt1, out0, out1, out2, out3);
2268 ST_D4(tmp0, tmp1, 0, 1, 0, 1,
dst, dst_stride);
2269 dst += (4 * dst_stride);
2291 v16i8
src0,
src1,
src2, src3, filt0, filt1, mask0, mask1, mask2, mask3;
2292 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
2295 v8i16
filt, out0, out1, out2, out3, out4, out5;
2309 for (loop_cnt = 4; loop_cnt--;) {
2311 src += (4 * src_stride);
2315 DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1);
2321 ST_W4(tmp0, 0, 1, 2, 3,
dst + 8, dst_stride);
2325 DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2326 out2, out3, out4, out5);
2329 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1,
2330 out2, out3, out4, out5);
2335 ST_D4(tmp0, tmp1, 0, 1, 0, 1,
dst, dst_stride);
2336 dst += (4 * dst_stride);
2346 v16i8 filt0, filt1, mask0, mask1;
2347 v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
2348 v8i16
filt, out0, out1, out2, out3, out4, out5, out6, out7;
2360 for (loop_cnt = (
height >> 2); loop_cnt--;) {
2363 src += (4 * src_stride);
2369 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2370 out0, out1, out2, out3);
2373 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2374 out0, out1, out2, out3);
2384 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
2385 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
2386 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2387 out4, out5, out6, out7);
2388 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
2389 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
2390 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2391 out4, out5, out6, out7);
2407 uint8_t *dst1 =
dst + 16;
2410 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2411 v16i8 filt0, filt1, mask0, mask1, mask00, mask11;
2412 v8i16
filt, out0, out1, out2, out3;
2424 mask11 = mask0 + 10;
2426 for (loop_cnt = 8; loop_cnt--;) {
2429 src += (4 * src_stride);
2436 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2437 out0, out1, out2, out3);
2438 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2439 out0, out1, out2, out3);
2449 VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1);
2450 VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3);
2451 VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5);
2452 VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7);
2453 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2454 out0, out1, out2, out3);
2455 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2456 out0, out1, out2, out3);
2468 VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
2470 VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7);
2472 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2473 out0, out1, out2, out3);
2474 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2475 out0, out1, out2, out3);
2481 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst1, dst_stride);
2482 dst1 += (4 * dst_stride);
2492 v16i8 filt0, filt1, mask0, mask1;
2494 v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
2495 v8i16
filt, out0, out1, out2, out3, out4, out5, out6, out7;
2506 for (loop_cnt = (
height >> 1); loop_cnt--;) {
2522 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2523 out0, out1, out2, out3);
2526 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2527 out0, out1, out2, out3);
2529 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
2530 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
2531 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2532 out4, out5, out6, out7);
2533 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
2534 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
2535 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2536 out4, out5, out6, out7);
2558 v16i8
src0,
src1,
src2, src3, src4, src10_r, src32_r, src21_r, src43_r;
2559 v16i8 src2110, src4332, filt0, filt1;
2569 src += (3 * src_stride);
2572 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2573 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2576 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2577 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2579 out10 = __msa_srari_h(out10, 6);
2580 out10 = __msa_sat_s_h(out10, 7);
2591 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2592 v16i8 src2110, src4332, filt0, filt1;
2593 v8i16
filt, out10, out32;
2602 src += (3 * src_stride);
2606 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2607 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2609 for (loop_cnt = (
height >> 2); loop_cnt--;) {
2610 LD_SB3(
src, src_stride, src3, src4, src5);
2611 src += (3 * src_stride);
2613 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2614 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2618 src += (src_stride);
2620 src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
2621 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2627 dst += (4 * dst_stride);
2649 v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2650 v8i16 dst0_r, dst1_r, dst2_r, dst3_r, filt0, filt1, filter_vec;
2658 src += (3 * src_stride);
2663 src += (2 * src_stride);
2671 src += (2 * src_stride);
2673 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2679 SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2682 ST_W2(out0, 0, 2,
dst, dst_stride);
2683 ST_H2(out0, 2, 6,
dst + 4, dst_stride);
2684 ST_W2(out1, 0, 2,
dst + 2 * dst_stride, dst_stride);
2685 ST_H2(out1, 2, 6,
dst + 2 * dst_stride + 4, dst_stride);
2686 dst += (4 * dst_stride);
2689 src += (2 * src_stride);
2691 ILVR_B2_SB(src3, src6, src4, src3, src32_r, src43_r);
2697 src += (2 * src_stride);
2699 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2705 SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2708 ST_W2(out0, 0, 2,
dst, dst_stride);
2709 ST_H2(out0, 2, 6,
dst + 4, dst_stride);
2710 ST_W2(out1, 0, 2,
dst + 2 * dst_stride, dst_stride);
2711 ST_H2(out1, 2, 6,
dst + 2 * dst_stride + 4, dst_stride);
2719 v8i16 src01, src12, src23, src34, tmp0, tmp1,
filt, filt0, filt1;
2745 uint64_t out0, out1, out2;
2747 v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2;
2748 v8i16
filt, filt0, filt1;
2757 src += (3 * src_stride);
2762 for (loop_cnt = 2; loop_cnt--;) {
2763 LD_SB3(
src, src_stride, src3, src4, src5);
2764 src += (3 * src_stride);
2767 ILVR_B3_SH(src3,
src2, src4, src3, src5, src4, vec1, vec3, vec4);
2772 tmp2 = __msa_srari_h(tmp2, 6);
2777 out0 = __msa_copy_u_d((v2i64) tmp0, 0);
2778 out1 = __msa_copy_u_d((v2i64) tmp0, 1);
2779 out2 = __msa_copy_u_d((v2i64) tmp2, 0);
2799 v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
2801 v8i16
filt, out0_r, out1_r, out2_r, out3_r;
2809 src += (3 * src_stride);
2814 for (loop_cnt = (
height >> 2); loop_cnt--;) {
2815 LD_SB4(
src, src_stride, src7, src8, src9, src10);
2816 src += (4 * src_stride);
2820 src72_r, src87_r, src98_r, src109_r);
2826 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2829 ST_D4(tmp0, tmp1, 0, 1, 0, 1,
dst, dst_stride);
2830 dst += (4 * dst_stride);
2844 }
else if (6 ==
height) {
2859 v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2860 v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
2861 v16i8 src2110, src4332, src6554;
2862 v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, filt0, filt1;
2865 src -= (1 * src_stride);
2871 src += (3 * src_stride);
2876 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
2878 for (loop_cnt = 4; loop_cnt--;) {
2879 LD_SB4(
src, src_stride, src3, src4, src5, src6);
2880 src += (4 * src_stride);
2885 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
2886 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2887 ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
2888 src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
2899 SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2903 ST_D4(out0, out1, 0, 1, 0, 1,
dst, dst_stride);
2905 ST_W4(out0, 0, 1, 2, 3,
dst + 8, dst_stride);
2906 dst += (4 * dst_stride);
2921 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
2922 v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
2923 v16u8 tmp0, tmp1, tmp2, tmp3;
2924 v8i16
filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2932 src += (3 * src_stride);
2938 for (loop_cnt = (
height >> 2); loop_cnt--;) {
2939 LD_SB4(
src, src_stride, src3, src4, src5, src6);
2940 src += (4 * src_stride);
2944 src32_r, src43_r, src54_r, src65_r);
2946 src32_l, src43_l, src54_l, src65_l);
2957 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2958 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
2959 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2960 out3_r, tmp0, tmp1, tmp2, tmp3);
2962 ST_UB4(tmp0, tmp1, tmp2, tmp3,
dst, dst_stride);
2963 dst += (4 * dst_stride);
2978 uint64_t out0, out1;
2979 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9, src10;
2980 v16i8 src11, filt0, filt1;
2981 v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
2982 v16i8 src109_r, src10_l, src32_l, src21_l, src43_l;
2984 v8i16
filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
2998 LD_SB3(
src + 16, src_stride, src6, src7, src8);
2999 src += (3 * src_stride);
3001 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3003 for (loop_cnt = 8; loop_cnt--;) {
3011 LD_SB2(
src + 16, src_stride, src9, src10);
3012 src += (2 * src_stride);
3014 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3029 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3033 PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r);
3035 out0 = __msa_copy_u_d((v2i64) out2_r, 0);
3036 out1 = __msa_copy_u_d((v2i64) out3_r, 0);
3051 LD_SB2(
src + 16, src_stride, src11, src8);
3052 src += (2 * src_stride);
3054 ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
3069 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3089 v16i8
src0,
src1,
src2, src3, src4, src6, src7, src8, src9, src10;
3090 v16i8 src10_r, src32_r, src76_r, src98_r;
3091 v16i8 src21_r, src43_r, src87_r, src109_r;
3092 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
3093 v16i8 src10_l, src32_l, src76_l, src98_l;
3094 v16i8 src21_l, src43_l, src87_l, src109_l;
3112 LD_SB3(
src + 16, src_stride, src6, src7, src8);
3113 src += (3 * src_stride);
3116 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3117 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
3119 for (loop_cnt = (
height >> 1); loop_cnt--;) {
3134 SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7);
3147 LD_SB2(
src + 16, src_stride, src9, src10);
3148 src += (2 * src_stride);
3150 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3151 ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
3161 SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7);
3167 dst += 2 * dst_stride;
3181 const int8_t *filter_x,
3182 const int8_t *filter_y)
3187 v8i16 filt_h0, filt_h1;
3190 v8i16 filter_vec,
tmp;
3191 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3192 v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
3195 src -= (src_stride + 1);
3197 filter_vec =
LD_SH(filter_x);
3200 filter_vec =
LD_SH(filter_y);
3225 tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
3226 tmp = __msa_srari_h(
tmp, 6);
3227 tmp = __msa_sat_s_h(
tmp, 7);
3236 const int8_t *filter_x,
3237 const int8_t *filter_y)
3242 v8i16 filt_h0, filt_h1;
3245 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3246 v8i16 filter_vec, tmp0, tmp1;
3247 v8i16 dst30, dst41, dst52, dst63;
3248 v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
3249 v4i32 dst0, dst1, dst2, dst3;
3251 src -= (src_stride + 1);
3253 filter_vec =
LD_SH(filter_x);
3256 filter_vec =
LD_SH(filter_y);
3269 VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
3283 SRA_4V(dst0, dst1, dst2, dst3, 6);
3295 const int8_t *filter_x,
3296 const int8_t *filter_y,
3302 v16i8 src6, src7, src8, src9, src10;
3304 v8i16 filt_h0, filt_h1;
3307 v8i16 filter_vec, tmp0, tmp1, tmp2, tmp3;
3308 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3309 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
3310 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
3311 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
3312 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
3313 v8i16 dst98_r, dst109_r;
3315 src -= (src_stride + 1);
3317 filter_vec =
LD_SH(filter_x);
3320 filter_vec =
LD_SH(filter_y);
3328 src += (3 * src_stride);
3337 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
3339 for (loop_cnt =
height >> 3; loop_cnt--;) {
3341 src3, src4, src5, src6, src7, src8, src9, src10);
3342 src += (8 * src_stride);
3346 VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
3347 VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
3348 VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
3349 VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
3356 dst32_r = __msa_ilvr_h(dst73, dst22);
3360 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
3361 dst76_r = __msa_ilvr_h(dst22, dst106);
3371 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
3372 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
3374 dst5_r, dst4_r, dst7_r, dst6_r,
3375 tmp0, tmp1, tmp2, tmp3);
3380 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3,
dst, dst_stride);
3381 dst += (8 * dst_stride);
3385 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
3393 const int8_t *filter_x,
3394 const int8_t *filter_y,
3399 filter_x, filter_y);
3400 }
else if (4 ==
height) {
3402 filter_x, filter_y);
3403 }
else if (0 == (
height % 8)) {
3405 filter_x, filter_y,
height);
3413 const int8_t *filter_x,
3414 const int8_t *filter_y,
3417 v16u8 out0, out1, out2;
3419 v16i8 src7, src8, src9, src10;
3421 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3424 v8i16 filt_h0, filt_h1, filter_vec;
3425 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
3426 v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3427 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3428 v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
3429 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3430 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3431 v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r;
3432 v8i16 dst98_l, dst65_l, dst54_l, dst76_l, dst87_l, dst109_l;
3433 v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
3435 src -= (src_stride + 1);
3437 filter_vec =
LD_SH(filter_x);
3440 filter_vec =
LD_SH(filter_y);
3448 src += (3 * src_stride);
3463 LD_SB8(
src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
3466 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3467 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3468 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3469 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3476 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3477 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
3478 VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
3479 VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
3495 PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
3496 PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
3497 dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
3510 dst3_l =
HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
3511 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
3512 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
3513 SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
3514 PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
3515 PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
3516 PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
3524 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3,
dst, dst_stride);
3525 ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7,
dst + 4, dst_stride);
3532 const int8_t *filter_x,
3533 const int8_t *filter_y)
3538 v8i16 filt_h0, filt_h1, filter_vec;
3541 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3542 v8i16 dst0, dst1, dst2, dst3, dst4;
3543 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
3544 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3545 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3546 v8i16 out0_r, out1_r;
3548 src -= (src_stride + 1);
3550 filter_vec =
LD_SH(filter_x);
3553 filter_vec =
LD_SH(filter_y);
3566 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3567 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3582 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3583 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);
3594 const int8_t *filter_x,
3595 const int8_t *filter_y,
3600 v16i8
src0,
src1,
src2, src3, src4, src5, src6, mask0, mask1;
3601 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3602 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
3603 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
3604 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3605 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
3606 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
3608 src -= (src_stride + 1);
3610 filter_vec =
LD_SH(filter_x);
3613 filter_vec =
LD_SH(filter_y);
3621 for (cnt = width8mult; cnt--;) {
3637 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3638 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3639 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3640 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3661 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3662 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3664 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
3665 dst3_r, tmp0, tmp1, tmp2, tmp3);
3670 ST_D4(out0, out1, 0, 1, 0, 1,
dst, dst_stride);
3679 const int8_t *filter_x,
3680 const int8_t *filter_y)
3682 v16u8 out0, out1, out2;
3683 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
3685 v8i16 filt_h0, filt_h1, filter_vec;
3688 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3689 v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
3690 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
3691 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3692 v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
3693 v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
3694 v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
3695 v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
3696 v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
3697 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
3699 src -= (src_stride + 1);
3701 filter_vec =
LD_SH(filter_x);
3704 filter_vec =
LD_SH(filter_y);
3712 src += (5 * src_stride);
3713 LD_SB4(
src, src_stride, src5, src6, src7, src8);
3721 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3722 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3723 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
3724 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
3725 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
3726 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
3760 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3761 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3762 SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
3764 dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
3765 PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);
3768 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3774 ST_D4(out0, out1, 0, 1, 0, 1,
dst, dst_stride);
3775 ST_D2(out2, 0, 1,
dst + 4 * dst_stride, dst_stride);
3782 const int8_t *filter_x,
3783 const int8_t *filter_y,
3787 uint32_t loop_cnt, cnt;
3788 const uint8_t *src_tmp;
3793 v8i16 filt_h0, filt_h1, filter_vec;
3796 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3797 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3798 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3799 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3800 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3801 v8i16 dst54_r, dst54_l, dst65_r, dst65_l, dst6;
3802 v8i16 out0_r, out1_r, out2_r, out3_r;
3804 src -= (src_stride + 1);
3806 filter_vec =
LD_SH(filter_x);
3809 filter_vec =
LD_SH(filter_y);
3816 for (cnt = width8mult; cnt--;) {
3821 src_tmp += (3 * src_stride);
3836 for (loop_cnt = (
height >> 2); loop_cnt--;) {
3837 LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3838 src_tmp += (4 * src_stride);
3842 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3843 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3844 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3845 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3866 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3867 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3870 dst2_l, dst2_r, dst3_l, dst3_r,
3871 out0_r, out1_r, out2_r, out3_r);
3874 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3877 ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
3878 dst_tmp += (4 * dst_stride);
3896 const int8_t *filter_x,
3897 const int8_t *filter_y,
3902 filter_x, filter_y);
3903 }
else if (4 ==
height) {
3905 filter_x, filter_y, 1);
3906 }
else if (6 ==
height) {
3908 filter_x, filter_y);
3909 }
else if (0 == (
height % 4)) {
3911 filter_x, filter_y,
height, 1);
3919 const int8_t *filter_x,
3920 const int8_t *filter_y,
3924 const uint8_t *src_tmp;
3927 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9, src10;
3928 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3929 v16i8 mask0, mask1, mask2, mask3;
3930 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
3931 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
3932 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
3933 v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
3934 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
3935 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
3936 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3937 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3939 src -= (src_stride + 1);
3941 filter_vec =
LD_SH(filter_x);
3944 filter_vec =
LD_SH(filter_y);
3956 src_tmp += (3 * src_stride);
3971 for (loop_cnt = 4; loop_cnt--;) {
3972 LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3973 src_tmp += (4 * src_stride);
3976 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3977 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3978 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3979 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4000 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4001 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4003 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
4004 dst3_r, tmp0, tmp1, tmp2, tmp3);
4009 ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
4010 dst_tmp += (4 * dst_stride);
4026 src += (3 * src_stride);
4035 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
4037 for (loop_cnt = 2; loop_cnt--;) {
4039 src3, src4, src5, src6, src7, src8, src9, src10);
4040 src += (8 * src_stride);
4042 VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
4043 VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
4044 VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
4045 VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
4052 dst32_r = __msa_ilvr_h(dst73, dst22);
4056 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
4057 dst76_r = __msa_ilvr_h(dst22, dst106);
4067 SRA_4V(dst0, dst1, dst2, dst3, 6);
4068 SRA_4V(dst4, dst5, dst6, dst7, 6);
4069 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
4070 tmp0, tmp1, tmp2, tmp3);
4075 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3,
dst, dst_stride);
4076 dst += (8 * dst_stride);
4080 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
4088 const int8_t *filter_x,
4089 const int8_t *filter_y,
4097 filter_x, filter_y,
height, 2);
4105 const int8_t *filter_x,
4106 const int8_t *filter_y,
4110 filter_x, filter_y,
height, 3);
4117 const int8_t *filter_x,
4118 const int8_t *filter_y,
4122 filter_x, filter_y,
height, 4);
4125 #define UNI_MC_COPY(WIDTH) \
4126 void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
4127 ptrdiff_t dst_stride, \
4128 const uint8_t *src, \
4129 ptrdiff_t src_stride, \
4135 copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height); \
4148 #define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
4149 void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
4150 ptrdiff_t dst_stride, \
4151 const uint8_t *src, \
4152 ptrdiff_t src_stride, \
4158 const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR]; \
4160 common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
4200 #define UNI_MC_HV(PEL, WIDTH, TAP) \
4201 void ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst, \
4202 ptrdiff_t dst_stride, \
4203 const uint8_t *src, \
4204 ptrdiff_t src_stride, \
4210 const int8_t *filter_x = ff_hevc_##PEL##_filters[mx]; \
4211 const int8_t *filter_y = ff_hevc_##PEL##_filters[my]; \
4213 hevc_hv_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
4214 filter_x, filter_y, height); \