27 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
29 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
31 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
34 #define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \ 35 mask0, mask1, mask2, mask3, \ 36 filt0, filt1, filt2, filt3, \ 39 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ 41 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ 42 DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \ 43 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ 44 DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \ 45 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \ 46 DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1); \ 47 VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \ 48 DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, out0, out1); \ 51 #define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \ 52 mask0, mask1, mask2, mask3, \ 53 filt0, filt1, filt2, filt3, \ 54 out0, out1, out2, out3) \ 56 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ 58 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ 59 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ 60 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ 61 out0, out1, out2, out3); \ 62 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \ 63 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \ 64 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \ 65 out0, out1, out2, out3); \ 66 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \ 67 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \ 68 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \ 69 out0, out1, out2, out3); \ 70 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \ 71 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \ 72 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \ 73 out0, out1, out2, out3); \ 76 #define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \ 77 mask0, mask1, filt0, filt1, \ 80 v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ 82 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ 83 DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \ 84 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ 85 DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \ 88 #define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \ 89 mask0, mask1, filt0, filt1, \ 90 out0, out1, out2, out3) \ 92 v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ 94 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ 95 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ 96 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ 97 out0, out1, out2, out3); \ 98 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \ 99 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \ 100 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \ 101 out0, out1, out2, out3); \ 109 uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
112 LD2(src, src_stride, out0, out1);
116 }
else if (6 == height) {
117 LD4(src, src_stride, out0, out1, out2, out3);
118 src += (4 * src_stride);
119 SD4(out0, out1, out2, out3, dst, dst_stride);
120 dst += (4 * dst_stride);
121 LD2(src, src_stride, out0, out1);
125 }
else if (0 == (height % 8)) {
126 for (cnt = (height >> 3); cnt--;) {
127 LD4(src, src_stride, out0, out1, out2, out3);
128 src += (4 * src_stride);
129 LD4(src, src_stride, out4, out5, out6, out7);
130 src += (4 * src_stride);
131 SD4(out0, out1, out2, out3, dst, dst_stride);
132 dst += (4 * dst_stride);
133 SD4(out4, out5, out6, out7, dst, dst_stride);
134 dst += (4 * dst_stride);
136 }
else if (0 == (height % 4)) {
137 for (cnt = (height >> 2); cnt--;) {
138 LD4(src, src_stride, out0, out1, out2, out3);
139 src += (4 * src_stride);
140 SD4(out0, out1, out2, out3, dst, dst_stride);
141 dst += (4 * dst_stride);
150 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
152 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
153 src += (8 * src_stride);
154 ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
155 dst += (8 * dst_stride);
156 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
157 ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
165 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
168 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
169 src += (8 * src_stride);
170 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
171 dst += (8 * dst_stride);
172 LD_UB4(src, src_stride, src0, src1, src2, src3);
173 src += (4 * src_stride);
174 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
175 dst += (4 * dst_stride);
176 }
else if (0 == (height % 8)) {
177 for (cnt = (height >> 3); cnt--;) {
178 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6,
180 src += (8 * src_stride);
181 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst,
183 dst += (8 * dst_stride);
185 }
else if (0 == (height % 4)) {
186 for (cnt = (height >> 2); cnt--;) {
187 LD_UB4(src, src_stride, src0, src1, src2, src3);
188 src += (4 * src_stride);
190 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
191 dst += (4 * dst_stride);
201 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
202 uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
204 for (cnt = 4; cnt--;) {
205 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
206 LD4(src + 16, src_stride, out0, out1, out2, out3);
207 src += (4 * src_stride);
208 LD4(src + 16, src_stride, out4, out5, out6, out7);
209 src += (4 * src_stride);
211 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
212 SD4(out0, out1, out2, out3, dst + 16, dst_stride);
213 dst += (4 * dst_stride);
214 SD4(out4, out5, out6, out7, dst + 16, dst_stride);
215 dst += (4 * dst_stride);
224 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
226 for (cnt = (height >> 2); cnt--;) {
227 LD_UB4(src, src_stride, src0, src1, src2, src3);
228 LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
229 src += (4 * src_stride);
230 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
231 ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
232 dst += (4 * dst_stride);
241 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
244 for (cnt = (height >> 2); cnt--;) {
245 LD_UB4(src, src_stride, src0, src1, src2, src3);
246 LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
247 LD_UB4(src + 32, src_stride, src8, src9, src10, src11);
248 src += (4 * src_stride);
250 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
251 ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
252 ST_UB4(src8, src9, src10, src11, dst + 32, dst_stride);
253 dst += (4 * dst_stride);
262 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
263 v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
265 for (cnt = (height >> 2); cnt--;) {
266 LD_UB4(src, 16, src0, src1, src2, src3);
268 LD_UB4(src, 16, src4, src5, src6, src7);
270 LD_UB4(src, 16, src8, src9, src10, src11);
272 LD_UB4(src, 16, src12, src13, src14, src15);
275 ST_UB4(src0, src1, src2, src3, dst, 16);
277 ST_UB4(src4, src5, src6, src7, dst, 16);
279 ST_UB4(src8, src9, src10, src11, dst, 16);
281 ST_UB4(src12, src13, src14, src15, dst, 16);
290 v16u8 mask0, mask1, mask2, mask3,
out;
291 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2, filt3;
292 v8i16
filt, out0, out1;
298 filt =
LD_SH(filter);
299 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
305 LD_SB4(src, src_stride, src0, src1, src2, src3);
308 mask3, filt0, filt1, filt2, filt3, out0, out1);
312 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
319 v16i8 filt0, filt1, filt2, filt3;
321 v16u8 mask0, mask1, mask2, mask3,
out;
322 v8i16
filt, out0, out1, out2, out3;
328 filt =
LD_SH(filter);
329 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
335 LD_SB4(src, src_stride, src0, src1, src2, src3);
337 src += (4 * src_stride);
339 mask3, filt0, filt1, filt2, filt3, out0, out1);
340 LD_SB4(src, src_stride, src0, src1, src2, src3);
343 mask3, filt0, filt1, filt2, filt3, out2, out3);
347 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
349 ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
356 v16u8 mask0, mask1, mask2, mask3,
out;
357 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2, filt3;
358 v8i16
filt, out0, out1, out2, out3;
364 filt =
LD_SH(filter);
365 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
371 LD_SB4(src, src_stride, src0, src1, src2, src3);
373 src += (4 * src_stride);
375 mask3, filt0, filt1, filt2, filt3, out0, out1);
376 LD_SB4(src, src_stride, src0, src1, src2, src3);
378 src += (4 * src_stride);
380 mask3, filt0, filt1, filt2, filt3, out2, out3);
384 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
386 ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
387 dst += (8 * dst_stride);
389 LD_SB4(src, src_stride, src0, src1, src2, src3);
391 src += (4 * src_stride);
393 mask3, filt0, filt1, filt2, filt3, out0, out1);
394 LD_SB4(src, src_stride, src0, src1, src2, src3);
396 src += (4 * src_stride);
398 mask3, filt0, filt1, filt2, filt3, out2, out3);
403 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
405 ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
414 }
else if (8 == height) {
416 }
else if (16 == height) {
426 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2, filt3;
427 v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
428 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;
429 v8i16
filt, out0, out1, out2, out3;
435 filt =
LD_SH(filter);
436 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
442 for (loop_cnt = (height >> 2); loop_cnt--;) {
443 LD_SB4(src, src_stride, src0, src1, src2, src3);
445 src += (4 * src_stride);
447 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
448 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
449 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
450 out0, out1, out2, out3);
451 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);
452 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);
453 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,
454 out0, out1, out2, out3);
455 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);
456 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);
457 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,
458 out0, out1, out2, out3);
459 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);
460 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);
461 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,
462 out0, out1, out2, out3);
468 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
469 dst += (4 * dst_stride);
478 v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00;
479 v16u8 tmp0, tmp1, tmp2;
480 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
481 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
482 v16i8 filt0, filt1, filt2, filt3;
483 v8i16
filt, out0, out1, out2, out3, out4, out5;
491 filt =
LD_SH(filter);
492 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
501 for (loop_cnt = 4; loop_cnt--;) {
503 LD_SB4(src, src_stride, src0, src1, src2, src3);
505 LD_SB4(src + 8, src_stride, src4, src5, src6, src7);
509 src += (4 * src_stride);
511 VSHF_B2_SB(src0, src0, src1, src1, mask00, mask00, vec0, vec1);
512 VSHF_B2_SB(src2, src2, src3, src3, mask00, mask00, vec2, vec3);
513 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0,
515 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
516 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
517 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, out0,
519 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
520 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
521 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, out0,
523 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
524 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
525 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, out0,
529 VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec0, vec1);
531 VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec2, vec3);
533 VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5);
535 VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec6, vec7);
546 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
547 ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
548 dst += (4 * dst_stride);
557 v16u8 mask0, mask1, mask2, mask3,
out;
558 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
559 v16i8 filt0, filt1, filt2, filt3;
560 v8i16
filt, out0, out1, out2, out3;
566 filt =
LD_SH(filter);
567 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
573 for (loop_cnt = (height >> 2); loop_cnt--;) {
574 LD_SB2(src, src_stride, src0, src2);
575 LD_SB2(src + 8, src_stride, src1, src3);
576 src += (2 * src_stride);
578 LD_SB2(src, src_stride, src4, src6);
579 LD_SB2(src + 8, src_stride, src5, src7);
580 src += (2 * src_stride);
585 mask3, filt0, filt1, filt2, filt3, out0,
597 mask3, filt0, filt1, filt2, filt3, out0,
615 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2, filt3;
616 v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7,
out;
617 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
619 v8i16 out0, out1, out2, out3, out8, out9,
filt;
625 filt =
LD_SH(filter);
626 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
636 for (loop_cnt = 16; loop_cnt--;) {
637 LD_SB2(src, src_stride, src0, src2);
638 LD_SB2(src + 16, src_stride, src1, src3);
640 src += (2 * src_stride);
641 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8);
642 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9);
643 VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3);
644 DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
647 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8);
648 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9);
649 VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3);
650 DPADD_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2,
651 out0, out8, out2, out9);
653 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10);
654 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11);
655 VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7);
656 DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
657 out0, out8, out2, out9);
659 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10);
660 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11);
661 VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7);
662 DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
663 out0, out8, out2, out9);
670 ST_D2(out, 0, 1, dst + 16, dst_stride);
685 v16u8 mask0, mask1, mask2, mask3,
out;
686 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
687 v16i8 filt0, filt1, filt2, filt3;
688 v8i16
filt, out0, out1, out2, out3;
694 filt =
LD_SH(filter);
695 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
701 for (loop_cnt = (height >> 1); loop_cnt--;) {
703 src1 =
LD_SB(src + 8);
704 src2 =
LD_SB(src + 16);
705 src3 =
LD_SB(src + 24);
710 src5 =
LD_SB(src + 8);
711 src6 =
LD_SB(src + 16);
712 src7 =
LD_SB(src + 24);
717 mask3, filt0, filt1, filt2, filt3, out0,
725 ST_UB(out, dst + 16);
729 mask3, filt0, filt1, filt2, filt3, out0,
736 ST_UB(out, dst + 16);
746 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2;
748 v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7,
out;
749 v8i16
filt, out0, out1, out2, out3;
755 filt =
LD_SH(filter);
756 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
766 for (loop_cnt = 64; loop_cnt--;) {
768 src1 =
LD_SB(src + 8);
769 src2 =
LD_SB(src + 16);
770 src3 =
LD_SB(src + 32);
771 src4 =
LD_SB(src + 40);
775 src4 = (v16i8) __msa_xori_b((v16u8) src4, 128);
777 VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0,
779 DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
780 VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1,
783 out2 = __msa_dpadd_s_h(out2, vec2, filt1);
784 VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2,
787 out2 = __msa_dpadd_s_h(out2, vec2, filt2);
789 VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3,
792 out2 = __msa_dpadd_s_h(out2, vec2, filt3);
795 out3 = __msa_srari_h(out2, 6);
800 VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask4, mask0, mask0,
802 DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
803 VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask5, mask1, mask1,
806 out2 = __msa_dpadd_s_h(out2, vec2, filt1);
807 VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask6, mask2, mask2,
810 out2 = __msa_dpadd_s_h(out2, vec2, filt2);
811 VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask7, mask3, mask3,
814 out2 = __msa_dpadd_s_h(out2, vec2, filt3);
817 out2 = __msa_srari_h(out2, 6);
820 ST_UB(out, dst + 16);
822 ST_UB(out, dst + 32);
832 v16u8 mask0, mask1, mask2, mask3,
out;
833 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
834 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
835 v16i8 filt0, filt1, filt2, filt3;
836 v8i16 res0, res1, res2, res3,
filt;
842 filt =
LD_SH(filter);
843 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
849 for (loop_cnt = height; loop_cnt--;) {
850 LD_SB8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
855 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
856 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
857 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
859 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
860 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
861 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
863 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
864 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
865 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
867 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
868 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
869 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
877 ST_UB(out, dst + 16);
879 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
880 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
881 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
883 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec0, vec1);
884 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec2, vec3);
885 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
887 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
888 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
889 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
891 VSHF_B2_SB(src4, src4, src5, src5, mask3, mask3, vec4, vec5);
892 VSHF_B2_SB(src6, src6, src7, src7, mask3, mask3, vec6, vec7);
893 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
899 ST_UB(out, dst + 32);
901 ST_UB(out, dst + 48);
912 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
913 v16i8 src11, src12, src13, src14;
914 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
915 v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
916 v16i8 src1110_r, src1211_r, src1312_r, src1413_r, src12111110, src14131312;
917 v16i8 src10998, filt0, filt1, filt2, filt3;
918 v8i16
filt, out10, out32, out54, out76;
920 src -= (3 * src_stride);
922 filt =
LD_SH(filter);
923 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
925 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
926 src += (7 * src_stride);
928 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
930 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
931 ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
935 for (loop_cnt = (height >> 3); loop_cnt--;) {
936 LD_SB4(src, src_stride, src7, src8, src9, src10);
937 src += (4 * src_stride);
938 LD_SB4(src, src_stride, src11, src12, src13, src14);
939 src += (4 * src_stride);
941 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
942 src87_r, src98_r, src109_r);
943 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
944 src1110_r, src1211_r, src1312_r, src1413_r);
945 ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
946 ILVR_D2_SB(src1211_r, src1110_r, src1413_r, src1312_r,
947 src12111110, src14131312);
951 DOTP_SB2_SH(src2110, src4332, filt0, filt0, out10, out32);
952 DOTP_SB2_SH(src6554, src8776, filt0, filt0, out54, out76);
953 DPADD_SB2_SH(src4332, src6554, filt1, filt1, out10, out32);
954 DPADD_SB2_SH(src8776, src10998, filt1, filt1, out54, out76);
955 DPADD_SB2_SH(src6554, src8776, filt2, filt2, out10, out32);
956 DPADD_SB2_SH(src10998, src12111110, filt2, filt2, out54, out76);
957 DPADD_SB2_SH(src8776, src10998, filt3, filt3, out10, out32);
958 DPADD_SB2_SH(src12111110, src14131312, filt3, filt3, out54, out76);
965 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
966 dst += (8 * dst_stride);
969 src4332 = src12111110;
970 src6554 = src14131312;
980 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
981 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
982 v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
984 v8i16
filt, out0_r, out1_r, out2_r, out3_r;
986 src -= (3 * src_stride);
988 filt =
LD_SH(filter);
989 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
991 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
993 src += (7 * src_stride);
994 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
996 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
998 for (loop_cnt = (height >> 2); loop_cnt--;) {
999 LD_SB4(src, src_stride, src7, src8, src9, src10);
1001 src += (4 * src_stride);
1003 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1004 src87_r, src98_r, src109_r);
1005 DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
1006 filt0, out0_r, out1_r, out2_r, out3_r);
1007 DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
1008 filt1, out0_r, out1_r, out2_r, out3_r);
1009 DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
1010 filt2, out0_r, out1_r, out2_r, out3_r);
1011 DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
1012 filt3, out0_r, out1_r, out2_r, out3_r);
1014 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1017 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
1018 dst += (4 * dst_stride);
1035 uint32_t out2, out3;
1036 uint64_t out0, out1;
1037 v16u8 tmp0, tmp1, tmp2, tmp3;
1038 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1039 v16i8 filt0, filt1, filt2, filt3;
1040 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1041 v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1042 v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1043 v8i16
filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1045 src -= (3 * src_stride);
1047 filt =
LD_SH(filter);
1048 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1050 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1051 src += (7 * src_stride);
1055 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1057 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1058 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
1060 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1062 for (loop_cnt = 4; loop_cnt--;) {
1063 LD_SB4(src, src_stride, src7, src8, src9, src10);
1065 src += (4 * src_stride);
1067 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1068 src87_r, src98_r, src109_r);
1069 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1070 src87_l, src98_l, src109_l);
1072 filt1, filt2, filt3);
1074 filt1, filt2, filt3);
1076 filt1, filt2, filt3);
1078 filt1, filt2, filt3);
1080 filt1, filt2, filt3);
1082 filt1, filt2, filt3);
1084 filt1, filt2, filt3);
1086 filt1, filt2, filt3);
1089 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1090 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1091 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1092 out3_r, tmp0, tmp1, tmp2, tmp3);
1095 out0 = __msa_copy_u_d((v2i64) tmp0, 0);
1096 out1 = __msa_copy_u_d((v2i64) tmp1, 0);
1097 out2 = __msa_copy_u_w((v4i32) tmp0, 2);
1098 out3 = __msa_copy_u_w((v4i32) tmp1, 2);
1100 SW(out2, (dst + 8));
1103 SW(out3, (dst + 8));
1105 out0 = __msa_copy_u_d((v2i64) tmp2, 0);
1106 out1 = __msa_copy_u_d((v2i64) tmp3, 0);
1107 out2 = __msa_copy_u_w((v4i32) tmp2, 2);
1108 out3 = __msa_copy_u_w((v4i32) tmp3, 2);
1110 SW(out2, (dst + 8));
1113 SW(out3, (dst + 8));
1137 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1138 v16i8 filt0, filt1, filt2, filt3;
1139 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1140 v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1141 v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1142 v16u8 tmp0, tmp1, tmp2, tmp3;
1143 v8i16
filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1145 src -= (3 * src_stride);
1147 filt =
LD_SH(filter);
1148 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1150 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1152 src += (7 * src_stride);
1153 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1155 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1156 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
1158 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1160 for (loop_cnt = (height >> 2); loop_cnt--;) {
1161 LD_SB4(src, src_stride, src7, src8, src9, src10);
1163 src += (4 * src_stride);
1165 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1166 src87_r, src98_r, src109_r);
1167 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1168 src87_l, src98_l, src109_l);
1170 filt1, filt2, filt3);
1172 filt1, filt2, filt3);
1174 filt1, filt2, filt3);
1176 filt1, filt2, filt3);
1178 filt1, filt2, filt3);
1180 filt1, filt2, filt3);
1182 filt1, filt2, filt3);
1184 filt1, filt2, filt3);
1187 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1188 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1189 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1190 out3_r, tmp0, tmp1, tmp2, tmp3);
1192 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
1193 dst += (4 * dst_stride);
1218 uint32_t loop_cnt, cnt;
1219 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1220 v16i8 filt0, filt1, filt2, filt3;
1221 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1222 v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1223 v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1224 v16u8 tmp0, tmp1, tmp2, tmp3;
1225 v8i16
filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1227 src -= (3 * src_stride);
1229 filt =
LD_SH(filter);
1230 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1232 for (cnt = (width >> 4); cnt--;) {
1236 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1238 src_tmp += (7 * src_stride);
1239 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
1240 src32_r, src54_r, src21_r);
1241 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1242 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
1243 src32_l, src54_l, src21_l);
1244 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1246 for (loop_cnt = (height >> 2); loop_cnt--;) {
1247 LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
1249 src_tmp += (4 * src_stride);
1250 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1251 src87_r, src98_r, src109_r);
1252 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1253 src87_l, src98_l, src109_l);
1255 filt0, filt1, filt2, filt3);
1257 filt0, filt1, filt2, filt3);
1259 filt0, filt1, filt2, filt3);
1261 filt0, filt1, filt2, filt3);
1263 filt0, filt1, filt2, filt3);
1265 filt0, filt1, filt2, filt3);
1267 filt0, filt1, filt2, filt3);
1269 filt0, filt1, filt2, filt3);
1272 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1273 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1274 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1275 out3_r, tmp0, tmp1, tmp2, tmp3);
1277 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
1278 dst_tmp += (4 * dst_stride);
1339 const int8_t *filter_x,
1340 const int8_t *filter_y,
1345 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1346 v16i8 src9, src10, src11, src12, src13, src14;
1347 v8i16 filt0, filt1, filt2, filt3;
1348 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1349 v16i8 mask1, mask2, mask3;
1351 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1352 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1353 v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
1354 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst1110_r, dst1312_r;
1355 v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r, dst1211_r, dst1413_r;
1356 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
1359 src -= ((3 * src_stride) + 3);
1360 filter_vec =
LD_SH(filter_x);
1361 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1363 filter_vec =
LD_SH(filter_y);
1366 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1372 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1373 src += (7 * src_stride);
1376 VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1377 VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1378 VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1379 vec8, vec9, vec10, vec11);
1380 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1381 vec12, vec13, vec14, vec15);
1396 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1398 for (loop_cnt = height >> 3; loop_cnt--;) {
1399 LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
1401 src += (8 * src_stride);
1402 XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
1404 VSHF_B4_SB(src7, src11, mask0, mask1, mask2, mask3,
1405 vec0, vec1, vec2, vec3);
1406 VSHF_B4_SB(src8, src12, mask0, mask1, mask2, mask3,
1407 vec4, vec5, vec6, vec7);
1408 VSHF_B4_SB(src9, src13, mask0, mask1, mask2, mask3,
1409 vec8, vec9, vec10, vec11);
1410 VSHF_B4_SB(src10, src14, mask0, mask1, mask2, mask3,
1411 vec12, vec13, vec14, vec15);
1422 dst76_r = __msa_ilvr_h(dst117, dst66);
1425 ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
1426 dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
1427 dst1110_r = __msa_ilvr_h(dst117, dst1410);
1429 dst0_r =
HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1430 filt_h1, filt_h2, filt_h3);
1431 dst1_r =
HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1432 filt_h1, filt_h2, filt_h3);
1433 dst2_r =
HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1434 filt_h1, filt_h2, filt_h3);
1435 dst3_r =
HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1436 filt_h1, filt_h2, filt_h3);
1437 dst4_r =
HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
1438 filt_h1, filt_h2, filt_h3);
1439 dst5_r =
HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
1440 filt_h1, filt_h2, filt_h3);
1441 dst6_r =
HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
1442 filt_h1, filt_h2, filt_h3);
1444 filt_h0, filt_h1, filt_h2, filt_h3);
1446 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1447 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1450 SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
1451 SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
1452 PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1453 PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
1456 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1457 dst += (8 * dst_stride);
1460 dst32_r = dst1110_r;
1461 dst54_r = dst1312_r;
1463 dst43_r = dst1211_r;
1464 dst65_r = dst1413_r;
1465 dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
1473 const int8_t *filter_x,
1474 const int8_t *filter_y,
1477 uint32_t loop_cnt, cnt;
1481 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1482 v8i16 filt0, filt1, filt2, filt3;
1483 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1484 v16i8 mask1, mask2, mask3;
1486 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1487 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1488 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1489 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
1490 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1491 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1492 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1493 v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
1496 src -= ((3 * src_stride) + 3);
1498 filter_vec =
LD_SH(filter_x);
1499 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1501 filter_vec =
LD_SH(filter_y);
1504 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1510 for (cnt = width >> 3; cnt--;) {
1514 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1515 src_tmp += (7 * src_stride);
1519 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1520 vec0, vec1, vec2, vec3);
1521 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1522 vec4, vec5, vec6, vec7);
1523 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1524 vec8, vec9, vec10, vec11);
1525 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1526 vec12, vec13, vec14, vec15);
1536 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1537 vec0, vec1, vec2, vec3);
1538 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1539 vec4, vec5, vec6, vec7);
1540 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1541 vec8, vec9, vec10, vec11);
1549 for (loop_cnt = height >> 1; loop_cnt--;) {
1550 LD_SB2(src_tmp, src_stride, src7, src8);
1552 src_tmp += 2 * src_stride;
1554 ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1555 dst10_r, dst32_r, dst54_r, dst21_r);
1556 ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1557 dst10_l, dst32_l, dst54_l, dst21_l);
1558 ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1559 ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1561 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1562 vec0, vec1, vec2, vec3);
1568 filt_h0, filt_h1, filt_h2, filt_h3);
1570 filt_h0, filt_h1, filt_h2, filt_h3);
1574 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
1575 vec0, vec1, vec2, vec3);
1581 filt_h0, filt_h1, filt_h2, filt_h3);
1583 filt_h0, filt_h1, filt_h2, filt_h3);
1587 SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
1589 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
1591 ST_D2(out, 0, 1, dst_tmp, dst_stride);
1592 dst_tmp += (2 * dst_stride);
1612 const int8_t *filter_x,
1613 const int8_t *filter_y,
1617 filter_x, filter_y, height, 8);
1624 const int8_t *filter_x,
1625 const int8_t *filter_y,
1631 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1632 v16i8 src11, src12, src13, src14;
1633 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1634 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1635 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1636 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1637 v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
1638 v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
1639 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
1640 v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
1641 v8i16 dst87_r, dst98_r, dst1110_r, dst1312_r, dst109_r, dst1211_r;
1642 v8i16 dst1413_r, dst87_l, filter_vec;
1643 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
1644 v4i32 dst0_l, dst1_l;
1646 src -= ((3 * src_stride) + 3);
1648 filter_vec =
LD_SH(filter_x);
1649 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1651 filter_vec =
LD_SH(filter_y);
1654 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1664 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1665 src_tmp += (7 * src_stride);
1669 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1670 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1671 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1673 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
1684 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1685 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1686 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1695 for (loop_cnt = 8; loop_cnt--;) {
1696 LD_SB2(src_tmp, src_stride, src7, src8);
1698 src_tmp += 2 * src_stride;
1700 ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
1701 dst32_r, dst54_r, dst21_r);
1702 ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
1703 dst32_l, dst54_l, dst21_l);
1704 ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1705 ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1707 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1714 filt_h0, filt_h1, filt_h2, filt_h3);
1716 filt_h0, filt_h1, filt_h2, filt_h3);
1720 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1727 filt_h0, filt_h1, filt_h2, filt_h3);
1729 filt_h0, filt_h1, filt_h2, filt_h3);
1733 SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
1735 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
1737 ST_D2(out0, 0, 1, dst_tmp, dst_stride);
1738 dst_tmp += (2 * dst_stride);
1757 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1758 src += (7 * src_stride);
1761 VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
1762 VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
1763 VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1765 VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
1781 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1783 for (loop_cnt = 2; loop_cnt--;) {
1784 LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
1786 src += (8 * src_stride);
1787 XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
1789 VSHF_B4_SB(src7, src11, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
1791 VSHF_B4_SB(src8, src12, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
1793 VSHF_B4_SB(src9, src13, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1795 VSHF_B4_SB(src10, src14, mask4, mask5, mask6, mask7, vec12, vec13,
1807 dst76_r = __msa_ilvr_h(dst117, dst66);
1810 ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
1811 dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
1812 dst1110_r = __msa_ilvr_h(dst117, dst1410);
1814 dst0_r =
HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1815 filt_h1, filt_h2, filt_h3);
1816 dst1_r =
HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1817 filt_h1, filt_h2, filt_h3);
1818 dst2_r =
HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1819 filt_h1, filt_h2, filt_h3);
1820 dst3_r =
HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1821 filt_h1, filt_h2, filt_h3);
1822 dst4_r =
HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
1823 filt_h1, filt_h2, filt_h3);
1824 dst5_r =
HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
1825 filt_h1, filt_h2, filt_h3);
1826 dst6_r =
HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
1827 filt_h1, filt_h2, filt_h3);
1829 filt_h0, filt_h1, filt_h2, filt_h3);
1831 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1832 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1835 SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
1836 SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
1837 PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1838 PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
1841 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1842 dst += (8 * dst_stride);
1845 dst32_r = dst1110_r;
1846 dst54_r = dst1312_r;
1848 dst43_r = dst1211_r;
1849 dst65_r = dst1413_r;
1850 dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
1858 const int8_t *filter_x,
1859 const int8_t *filter_y,
1863 filter_x, filter_y, height, 16);
1870 const int8_t *filter_x,
1871 const int8_t *filter_y,
1875 filter_x, filter_y, height, 24);
1882 const int8_t *filter_x,
1883 const int8_t *filter_y,
1887 filter_x, filter_y, height, 32);
1894 const int8_t *filter_x,
1895 const int8_t *filter_y,
1899 filter_x, filter_y, height, 48);
1906 const int8_t *filter_x,
1907 const int8_t *filter_y,
1911 filter_x, filter_y, height, 64);
1918 v16i8 filt0, filt1,
src0,
src1, mask0, mask1, vec0, vec1;
1926 filt =
LD_SH(filter);
1931 LD_SB2(src, src_stride, src0, src1);
1933 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1935 res0 = __msa_srari_h(res0, 6);
1936 res0 = __msa_sat_s_h(res0, 7);
1938 ST_W2(out, 0, 1, dst, dst_stride);
1945 v16i8
src0,
src1, src2, src3, filt0, filt1, mask0, mask1;
1946 v8i16
filt, out0, out1;
1953 filt =
LD_SH(filter);
1958 LD_SB4(src, src_stride, src0, src1, src2, src3);
1961 filt0, filt1, out0, out1);
1965 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1972 v16i8
src0,
src1, src2, src3, filt0, filt1, mask0, mask1;
1974 v8i16
filt, out0, out1, out2, out3;
1980 filt =
LD_SH(filter);
1985 LD_SB4(src, src_stride, src0, src1, src2, src3);
1986 src += (4 * src_stride);
1990 filt0, filt1, out0, out1);
1991 LD_SB4(src, src_stride, src0, src1, src2, src3);
1994 filt0, filt1, out2, out3);
1998 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2000 ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2007 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
2008 v16i8 filt0, filt1, mask0, mask1;
2010 v8i16
filt, out0, out1, out2, out3;
2016 filt =
LD_SH(filter);
2021 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2022 src += (8 * src_stride);
2025 filt0, filt1, out0, out1);
2027 filt0, filt1, out2, out3);
2031 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2033 ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2034 dst += (8 * dst_stride);
2036 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2037 src += (8 * src_stride);
2040 filt0, filt1, out0, out1);
2042 filt0, filt1, out2, out3);
2046 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2048 ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2057 }
else if (4 == height) {
2059 }
else if (8 == height) {
2061 }
else if (16 == height) {
2070 v16i8
src0,
src1, src2, src3, filt0, filt1, mask0, mask1;
2072 v8i16
filt, out0, out1, out2, out3;
2078 filt =
LD_SH(filter);
2083 LD_SB4(src, src_stride, src0, src1, src2, src3);
2084 src += (4 * src_stride);
2088 filt1, out0, out1, out2, out3);
2093 ST_W2(out4, 0, 2, dst, dst_stride);
2094 ST_H2(out4, 2, 6, dst + 4, dst_stride);
2095 ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride);
2096 ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2097 dst += (4 * dst_stride);
2099 LD_SB4(src, src_stride, src0, src1, src2, src3);
2100 src += (4 * src_stride);
2104 filt1, out0, out1, out2, out3);
2109 ST_W2(out4, 0, 2, dst, dst_stride);
2110 ST_H2(out4, 2, 6, dst + 4, dst_stride);
2111 ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride);
2112 ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2120 v16i8
src0,
src1, filt0, filt1, mask0, mask1;
2122 v8i16
filt, vec0, vec1, vec2, vec3;
2127 filt =
LD_SH(filter);
2132 for (loop_cnt = (height >> 1); loop_cnt--;) {
2133 LD_SB2(src, src_stride, src0, src1);
2134 src += (2 * src_stride);
2137 VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2138 DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1);
2139 VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
2144 ST_D2(out, 0, 1, dst, dst_stride);
2145 dst += (2 * dst_stride);
2154 v16i8
src0,
src1, src2, src3, filt0, filt1, mask0, mask1;
2156 v8i16
filt, out0, out1, out2, out3;
2162 filt =
LD_SH(filter);
2167 for (loop_cnt = (height >> 2); loop_cnt--;) {
2168 LD_SB4(src, src_stride, src0, src1, src2, src3);
2169 src += (4 * src_stride);
2173 filt1, out0, out1, out2, out3);
2178 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
2179 dst += (4 * dst_stride);
2187 if ((2 == height) || (6 == height)) {
2201 v16i8
src0,
src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3;
2202 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
2205 v8i16
filt, out0, out1, out2, out3, out4, out5;
2213 filt =
LD_SH(filter);
2219 for (loop_cnt = 4; loop_cnt--;) {
2220 LD_SB4(src, src_stride, src0, src1, src2, src3);
2221 src += (4 * src_stride);
2224 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
2225 DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1);
2226 VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec2, vec3);
2231 ST_W4(tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
2233 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5);
2234 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
2235 DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2236 out2, out3, out4, out5);
2237 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec8, vec9);
2238 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11);
2239 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1,
2240 out2, out3, out4, out5);
2245 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
2246 dst += (4 * dst_stride);
2255 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
2256 v16i8 filt0, filt1, mask0, mask1;
2257 v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
2258 v8i16
filt, out0, out1, out2, out3, out4, out5, out6, out7;
2265 filt =
LD_SH(filter);
2270 for (loop_cnt = (height >> 2); loop_cnt--;) {
2271 LD_SB4(src, src_stride, src0, src2, src4, src6);
2272 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2273 src += (4 * src_stride);
2277 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
2278 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
2279 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2280 out0, out1, out2, out3);
2281 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
2282 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
2283 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2284 out0, out1, out2, out3);
2294 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
2295 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
2296 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2297 out4, out5, out6, out7);
2298 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
2299 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
2300 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2301 out4, out5, out6, out7);
2319 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
2320 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2321 v16i8 filt0, filt1, mask0, mask1, mask00, mask11;
2322 v8i16
filt, out0, out1, out2, out3;
2329 filt =
LD_SH(filter);
2334 mask11 = mask0 + 10;
2336 for (loop_cnt = 8; loop_cnt--;) {
2337 LD_SB4(src, src_stride, src0, src2, src4, src6);
2338 LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
2339 src += (4 * src_stride);
2342 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, vec0, vec1);
2343 VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3);
2344 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask11, vec4, vec5);
2345 VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7);
2346 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2347 out0, out1, out2, out3);
2348 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2349 out0, out1, out2, out3);
2359 VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1);
2360 VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3);
2361 VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5);
2362 VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7);
2363 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2364 out0, out1, out2, out3);
2365 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2366 out0, out1, out2, out3);
2377 VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
2378 VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
2379 VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec4, vec5);
2380 VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7);
2382 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2383 out0, out1, out2, out3);
2384 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2385 out0, out1, out2, out3);
2391 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst1, dst_stride);
2392 dst1 += (4 * dst_stride);
2401 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
2402 v16i8 filt0, filt1, mask0, mask1;
2404 v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
2405 v8i16
filt, out0, out1, out2, out3, out4, out5, out6, out7;
2411 filt =
LD_SH(filter);
2416 for (loop_cnt = (height >> 1); loop_cnt--;) {
2418 src1 =
LD_SB(src + 8);
2419 src2 =
LD_SB(src + 16);
2420 src3 =
LD_SB(src + 24);
2423 src5 =
LD_SB(src + 8);
2424 src6 =
LD_SB(src + 16);
2425 src7 =
LD_SB(src + 24);
2430 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
2431 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
2432 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2433 out0, out1, out2, out3);
2434 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
2435 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
2436 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2437 out0, out1, out2, out3);
2439 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
2440 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
2441 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2442 out4, out5, out6, out7);
2443 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
2444 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
2445 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2446 out4, out5, out6, out7);
2454 ST_UB(out, dst + 16);
2459 ST_UB(out, dst + 16);
2468 v16i8
src0,
src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r;
2469 v16i8 src2110, src4332, filt0, filt1;
2475 filt =
LD_SH(filter);
2478 LD_SB3(src, src_stride, src0, src1, src2);
2479 src += (3 * src_stride);
2481 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2482 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2483 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2484 LD_SB2(src, src_stride, src3, src4);
2485 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2486 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2487 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2489 out10 = __msa_srari_h(out10, 6);
2490 out10 = __msa_sat_s_h(out10, 7);
2492 ST_W2(out, 0, 1, dst, dst_stride);
2500 v16i8
src0,
src1, src2, src3, src4, src5;
2501 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2502 v16i8 src2110, src4332, filt0, filt1;
2503 v8i16
filt, out10, out32;
2508 filt =
LD_SH(filter);
2511 LD_SB3(src, src_stride, src0, src1, src2);
2512 src += (3 * src_stride);
2514 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2516 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2517 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2519 for (loop_cnt = (height >> 2); loop_cnt--;) {
2520 LD_SB3(src, src_stride, src3, src4, src5);
2521 src += (3 * src_stride);
2522 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2523 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2524 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2528 src += (src_stride);
2529 ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
2530 src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
2531 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2536 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2537 dst += (4 * dst_stride);
2558 v16i8
src0,
src1, src2, src3, src4, src5, src6;
2559 v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2560 v8i16 dst0_r, dst1_r, dst2_r, dst3_r, filt0, filt1, filter_vec;
2564 filter_vec =
LD_SH(filter);
2567 LD_SB3(src, src_stride, src0, src1, src2);
2568 src += (3 * src_stride);
2570 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2572 LD_SB2(src, src_stride, src3, src4);
2573 src += (2 * src_stride);
2575 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2580 LD_SB2(src, src_stride, src5, src6);
2581 src += (2 * src_stride);
2583 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2589 SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2592 ST_W2(out0, 0, 2, dst, dst_stride);
2593 ST_H2(out0, 2, 6, dst + 4, dst_stride);
2594 ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
2595 ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2596 dst += (4 * dst_stride);
2598 LD_SB2(src, src_stride, src3, src4);
2599 src += (2 * src_stride);
2601 ILVR_B2_SB(src3, src6, src4, src3, src32_r, src43_r);
2606 LD_SB2(src, src_stride, src5, src6);
2607 src += (2 * src_stride);
2609 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2615 SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2618 ST_W2(out0, 0, 2, dst, dst_stride);
2619 ST_H2(out0, 2, 6, dst + 4, dst_stride);
2620 ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
2621 ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2628 v16i8
src0,
src1, src2, src3, src4;
2629 v8i16 src01, src12, src23, src34, tmp0, tmp1,
filt, filt0, filt1;
2635 filt =
LD_SH(filter);
2638 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2640 ILVR_B2_SH(src1, src0, src3, src2, src01, src23);
2642 ILVR_B2_SH(src2, src1, src4, src3, src12, src34);
2647 ST_D2(out, 0, 1, dst, dst_stride);
2655 uint64_t out0, out1, out2;
2656 v16i8
src0,
src1, src2, src3, src4, src5;
2657 v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2;
2658 v8i16
filt, filt0, filt1;
2663 filt =
LD_SH(filter);
2666 LD_SB3(src, src_stride, src0, src1, src2);
2667 src += (3 * src_stride);
2670 ILVR_B2_SH(src1, src0, src2, src1, vec0, vec2);
2672 for (loop_cnt = 2; loop_cnt--;) {
2673 LD_SB3(src, src_stride, src3, src4, src5);
2674 src += (3 * src_stride);
2677 ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4);
2682 tmp2 = __msa_srari_h(tmp2, 6);
2687 out0 = __msa_copy_u_d((v2i64) tmp0, 0);
2688 out1 = __msa_copy_u_d((v2i64) tmp0, 1);
2689 out2 = __msa_copy_u_d((v2i64) tmp2, 0);
2708 v16i8
src0,
src1, src2, src7, src8, src9, src10;
2709 v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
2711 v8i16
filt, out0_r, out1_r, out2_r, out3_r;
2715 filt =
LD_SH(filter);
2718 LD_SB3(src, src_stride, src0, src1, src2);
2719 src += (3 * src_stride);
2722 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2724 for (loop_cnt = (height >> 2); loop_cnt--;) {
2725 LD_SB4(src, src_stride, src7, src8, src9, src10);
2726 src += (4 * src_stride);
2729 ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
2730 src72_r, src87_r, src98_r, src109_r);
2736 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2739 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
2740 dst += (4 * dst_stride);
2754 }
else if (6 == height) {
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
    v16i8 src2110, src4332, src6554;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, filt0, filt1;

    src -= (1 * src_stride);

    filter_vec = LD_SH(filter);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
        ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
        ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
        src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);

        SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);

        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);

        ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
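        /*
         * Note (added comment): for 12-wide columns the right interleaves
         * (*_r) cover pixels 0..7 while the left interleaves (*_l), packed
         * pairwise with __msa_ilvr_d, cover pixels 8..11; hence the split
         * store of four doublewords (ST_D4) plus four words at dst + 8
         * (ST_W4) per 4-row block.
         */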
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
    v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    filt = LD_SH(filter);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_r, src43_r, src54_r, src65_r);
        ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_l, src43_l, src54_l, src65_l);

        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);

        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
        dst += (4 * dst_stride);
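        /*
         * Note (added comment): each 16-pixel row is filtered as two
         * independent 8-element streams: ILVR_B* builds the low-half
         * (right) byte interleaves and ILVL_B* the high-half (left) ones.
         * PCKEV_B4_UB then packs the even bytes of each left/right result
         * pair back into full 16-byte rows for ST_UB4.
         */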
    uint64_t out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, filt0, filt1;
    v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
    v16i8 src109_r, src10_l, src32_l, src21_l, src43_l;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;

    filt = LD_SH(filter);

    LD_SB3(src, src_stride, src0, src1, src2);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    LD_SB3(src + 16, src_stride, src6, src7, src8);
    src += (3 * src_stride);

    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB2(src, src_stride, src3, src4);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        LD_SB2(src + 16, src_stride, src9, src10);
        src += (2 * src_stride);

        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);

        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);

        PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r);

        out0 = __msa_copy_u_d((v2i64) out2_r, 0);
        out1 = __msa_copy_u_d((v2i64) out3_r, 0);

        LD_SB2(src, src_stride, src5, src2);

        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);

        LD_SB2(src + 16, src_stride, src11, src8);
        src += (2 * src_stride);

        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);

        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);

        ST_D1(out, 0, dst + 16);

        ST_D1(out, 0, dst + 16);
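        /*
         * Note (added comment): 24-wide rows are handled as a 16-pixel band
         * (full-vector loads at src with right and left interleaves) plus an
         * 8-pixel band at src + 16 that only needs the right interleaves;
         * the narrow band's packed results are written with 64-bit ST_D1
         * stores at dst + 16.
         */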
    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
    v16i8 src10_l, src32_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src87_l, src109_l;

    filt = LD_SH(filter);

    LD_SB3(src, src_stride, src0, src1, src2);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    LD_SB3(src + 16, src_stride, src6, src7, src8);
    src += (3 * src_stride);

    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src3, src4);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7);

        ST_UB(out, dst + dst_stride);

        LD_SB2(src + 16, src_stride, src9, src10);
        src += (2 * src_stride);

        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);

        SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7);

        ST_UB(out, dst + 16);

        ST_UB(out, dst + 16 + dst_stride);

        dst += 2 * dst_stride;
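        /*
         * Note (added comment): the 32-wide path runs the same 16-pixel
         * right/left interleave pipeline twice per row pair, once on the
         * columns at src and once on those at src + 16, storing full
         * 16-byte vectors (ST_UB) for both bands before advancing two rows.
         */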
                                   const int8_t *filter_x,
                                   const int8_t *filter_y)

    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt_h0, filt_h1;
    v8i16 filter_vec, tmp;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);

    filter_vec = LD_SH(filter_y);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);

    VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);

    tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
    tmp = __msa_srari_h(tmp, 6);
    tmp = __msa_sat_s_h(tmp, 7);

    ST_W2(out, 0, 1, dst, dst_stride);
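/*
 * Illustrative scalar reference (an addition, not part of the original
 * file) for the separable hv paths in this block, assuming the standard
 * HEVC two-stage scheme visible above: a horizontal 4-tap pass kept at
 * 16-bit precision, a vertical 4-tap pass over those intermediates, a
 * 6-bit shift for the second filter stage (SRA_4V(..., 6)), then the
 * uni-prediction 6-bit rounding (srari by 6). The helper name is
 * hypothetical; av_clip_uint8() is assumed available via libavutil.
 */
static uint8_t ref_hv_4tap_pixel(const uint8_t *src, int stride,
                                 const int8_t *fx, const int8_t *fy)
{
    int16_t h[4];
    int i, sum;

    /* horizontal 4-tap on the four rows the vertical filter needs */
    for (i = 0; i < 4; i++) {
        const uint8_t *s = src + (i - 1) * stride;

        h[i] = s[-1] * fx[0] + s[0] * fx[1] + s[1] * fx[2] + s[2] * fx[3];
    }
    /* vertical 4-tap on the intermediates, second-stage 6-bit shift */
    sum = (h[0] * fy[0] + h[1] * fy[1] + h[2] * fy[2] + h[3] * fy[3]) >> 6;

    return av_clip_uint8((sum + 32) >> 6); /* uni-prediction rounding */
}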
                                   const int8_t *filter_x,
                                   const int8_t *filter_y)

    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt_h0, filt_h1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filter_vec, tmp0, tmp1;
    v8i16 dst30, dst41, dst52, dst63;
    v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
    v4i32 dst0, dst1, dst2, dst3;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);

    filter_vec = LD_SH(filter_y);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);

    VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);

    SRA_4V(dst0, dst1, dst2, dst3, 6);

    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
                                          const int8_t *filter_x,
                                          const int8_t *filter_y,

    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10;
    v8i16 filt_h0, filt_h1;
    v8i16 filter_vec, tmp0, tmp1, tmp2, tmp3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 dst98_r, dst109_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);

    filter_vec = LD_SH(filter_y);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);

    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);

        VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);

        dst32_r = __msa_ilvr_h(dst73, dst22);

        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
                    dst5_r, dst4_r, dst7_r, dst6_r,
                    tmp0, tmp1, tmp2, tmp3);

        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
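        /*
         * Note (added comment): dst22 re-splats the upper half of the last
         * horizontal result (dst106) so the next 8-row iteration can seam
         * its dst32_r/dst76_r interleave chains onto the rows already
         * filtered, avoiding a reload of the three-row vertical history.
         */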
                                const int8_t *filter_x,
                                const int8_t *filter_y,

    if (2 == height) {
        hevc_hv_uni_4t_4x2_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y);
    } else if (4 == height) {
        hevc_hv_uni_4t_4x4_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y);
    } else if (0 == (height % 8)) {
        hevc_hv_uni_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
                                       filter_x, filter_y, height);
    }
                                const int8_t *filter_x,
                                const int8_t *filter_y,

    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src7, src8, src9, src10;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filt_h0, filt_h1, filter_vec;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
    v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r;
    v8i16 dst98_l, dst65_l, dst54_l, dst76_l, dst87_l, dst109_l;
    v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);

    filter_vec = LD_SH(filter_y);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);

    PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
    PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
    dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);

    dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
    SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
    PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
    PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
    PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);

    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
    ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
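    /*
     * Note (added comment): as in the other 6-wide paths, each output row
     * is written as a 4-byte word (ST_W8, columns 0..3 from out0/out1)
     * plus a 2-byte half (ST_H8 at dst + 4, columns 4..5 from out2).
     */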
                                 const int8_t *filter_x,
                                 const int8_t *filter_y)

    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt_h0, filt_h1, filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 out0_r, out1_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);

    filter_vec = LD_SH(filter_y);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);

    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);

    ST_D2(out, 0, 1, dst, dst_stride);
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,

    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);

    filter_vec = LD_SH(filter_y);

    for (cnt = width8mult; cnt--;) {
        LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);

        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, tmp0, tmp1, tmp2, tmp3);

        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
                                 const int8_t *filter_x,
                                 const int8_t *filter_y)

    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt_h0, filt_h1, filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);

    filter_vec = LD_SH(filter_y);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);
    LD_SB4(src, src_stride, src5, src6, src7, src8);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);

    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);

    PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
                dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
    PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);

    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);

    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
                                         const int8_t *filter_x,
                                         const int8_t *filter_y,

    uint32_t loop_cnt, cnt;

    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt_h0, filt_h1, filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l, dst6;
    v8i16 out0_r, out1_r, out2_r, out3_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);

    filter_vec = LD_SH(filter_y);

    for (cnt = width8mult; cnt--;) {
        LD_SB3(src_tmp, src_stride, src0, src1, src2);
        src_tmp += (3 * src_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
            src_tmp += (4 * src_stride);

            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

            SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
            SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);

            PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
                        dst2_l, dst2_r, dst3_l, dst3_r,
                        out0_r, out1_r, out2_r, out3_r);

            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);

            ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);
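        /*
         * Note (added comment): the outer cnt loop walks the picture in
         * 8-column bands, width8mult times, through the src_tmp/dst_tmp
         * cursors, so the inner (height >> 2) row loop can advance freely
         * within each band.
         */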
                                const int8_t *filter_x,
                                const int8_t *filter_y,

    if (2 == height) {
        hevc_hv_uni_4t_8x2_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y);
    } else if (4 == height) {
        hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, 1);
    } else if (6 == height) {
        hevc_hv_uni_4t_8x6_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y);
    } else if (0 == (height % 4)) {
        hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                       filter_x, filter_y, height, 1);
    }
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,

    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);

    filter_vec = LD_SH(filter_y);

    LD_SB3(src_tmp, src_stride, src0, src1, src2);
    src_tmp += (3 * src_stride);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
        src_tmp += (4 * src_stride);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);

        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, tmp0, tmp1, tmp2, tmp3);

        ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);

    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);

        VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);

        dst32_r = __msa_ilvr_h(dst73, dst22);

        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        SRA_4V(dst0, dst1, dst2, dst3, 6);
        SRA_4V(dst4, dst5, dst6, dst7, 6);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    tmp0, tmp1, tmp2, tmp3);

        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
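    /*
     * Note (added comment): the 12-wide case is composed from the two
     * kernels above: the first half filters the left 8 columns through
     * the mask0/mask1 horizontal shuffles, then the second half re-walks
     * the rows with the mask2/mask3 shuffles to produce the remaining
     * 4-wide column, stored with ST_W8.
     */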
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,

    hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 2);

                                   const int8_t *filter_x,
                                   const int8_t *filter_y,

    hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 3);

                                   const int8_t *filter_x,
                                   const int8_t *filter_y,

    hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 4);
#define UNI_MC_COPY(WIDTH)                                                    \
void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst,             \
                                                    ptrdiff_t dst_stride,     \
                                                    ptrdiff_t src_stride,     \

    copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height);        \

#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                          \
void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,          \
                                                       ptrdiff_t dst_stride,  \
                                                       ptrdiff_t src_stride,  \

    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];             \

    common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \

UNI_MC(qpel, h, 4, 8, hz, mx);
UNI_MC(qpel, h, 8, 8, hz, mx);
UNI_MC(qpel, h, 12, 8, hz, mx);
UNI_MC(qpel, h, 16, 8, hz, mx);
UNI_MC(qpel, h, 24, 8, hz, mx);
UNI_MC(qpel, h, 32, 8, hz, mx);
UNI_MC(qpel, h, 48, 8, hz, mx);
UNI_MC(qpel, h, 64, 8, hz, mx);
UNI_MC(qpel, v, 4, 8, vt, my);
UNI_MC(qpel, v, 8, 8, vt, my);
UNI_MC(qpel, v, 12, 8, vt, my);
UNI_MC(qpel, v, 16, 8, vt, my);
UNI_MC(qpel, v, 24, 8, vt, my);
UNI_MC(qpel, v, 32, 8, vt, my);
UNI_MC(qpel, v, 48, 8, vt, my);
UNI_MC(qpel, v, 64, 8, vt, my);
UNI_MC(epel, h, 4, 4, hz, mx);
UNI_MC(epel, h, 6, 4, hz, mx);
UNI_MC(epel, h, 8, 4, hz, mx);
UNI_MC(epel, h, 12, 4, hz, mx);
UNI_MC(epel, h, 16, 4, hz, mx);
UNI_MC(epel, h, 24, 4, hz, mx);
UNI_MC(epel, h, 32, 4, hz, mx);
UNI_MC(epel, v, 4, 4, vt, my);
UNI_MC(epel, v, 6, 4, vt, my);
UNI_MC(epel, v, 8, 4, vt, my);
UNI_MC(epel, v, 12, 4, vt, my);
UNI_MC(epel, v, 16, 4, vt, my);
UNI_MC(epel, v, 24, 4, vt, my);
UNI_MC(epel, v, 32, 4, vt, my);
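/*
 * For orientation (an added note, not in the original): with the argument
 * lists partly elided above, an instantiation such as
 * UNI_MC(qpel, h, 4, 8, hz, mx) expands to roughly the following wrapper;
 * the exact parameter list is an assumption based on the surviving
 * fragments:
 *
 * void ff_hevc_put_hevc_uni_qpel_h4_8_msa(uint8_t *dst, ptrdiff_t dst_stride,
 *                                         uint8_t *src, ptrdiff_t src_stride,
 *                                         int height, intptr_t mx,
 *                                         intptr_t my, int width)
 * {
 *     const int8_t *filter = ff_hevc_qpel_filters[mx - 1];
 *
 *     common_hz_8t_4w_msa(src, src_stride, dst, dst_stride, filter, height);
 * }
 */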
#define UNI_MC_HV(PEL, WIDTH, TAP)                                        \
void ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,         \
                                                    ptrdiff_t dst_stride, \
                                                    ptrdiff_t src_stride, \

    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];             \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];             \

    hevc_hv_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
                                        filter_x, filter_y, height);      \