                          int16_t *dst, int32_t dst_stride,
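    /* width-4 copy: interleave pairs of rows into one vector and zero-extend
       the bytes to 16 bits; HEVC keeps intermediate samples as pel << 6 */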
        LD_SB2(src, src_stride, src0, src1);

        src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0);
        in0 = (v8i16) __msa_ilvr_b(zero, src0);
    } else if (4 == height) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        ILVR_W2_SB(src1, src0, src3, src2, src0, src1);
        ST8x4_UB(in0, in1, dst, 2 * dst_stride);
    } else if (0 == height % 8) {
        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
        v8i16 in0, in1, in2, in3;
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD_SB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);

            ILVR_W4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
                       src0, src1, src2, src3);
            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                       in0, in1, in2, in3);
            ST8x8_UB(in0, in1, in2, in3, dst, 2 * dst_stride);
            dst += (8 * dst_stride);
                          int16_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
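    /* 8 rows per pass; each output row is six 16-bit samples, written as
       12 bytes by ST12x8_UB */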
    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);

        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0, in1, in2, in3);
        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                   in4, in5, in6, in7);
        ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, dst, 2 * dst_stride);
        dst += (8 * dst_stride);
                          int16_t *dst, int32_t dst_stride,
        LD_SB2(src, src_stride, src0, src1);
        ST_SH2(in0, in1, dst, dst_stride);
    } else if (4 == height) {
        v8i16 in0, in1, in2, in3;

        LD_SB4(src, src_stride, src0, src1, src2, src3);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0, in1, in2, in3);
        SLLI_4V(in0, in1, in2, in3, 6);
        ST_SH4(in0, in1, in2, in3, dst, dst_stride);
    } else if (6 == height) {
        v16i8 src0, src1, src2, src3, src4, src5;
        v8i16 in0, in1, in2, in3, in4, in5;

        LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0, in1, in2, in3);
        SLLI_4V(in0, in1, in2, in3, 6);
        ST_SH6(in0, in1, in2, in3, in4, in5, dst, dst_stride);
    } else if (0 == height % 8) {
        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
        v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD_SB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);

            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                       in0, in1, in2, in3);
            ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                       in4, in5, in6, in7);
            SLLI_4V(in0, in1, in2, in3, 6);
            SLLI_4V(in4, in5, in6, in7, 6);
            ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, dst, dst_stride);
            dst += (8 * dst_stride);
                          int16_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0, in1, in0_r, in1_r, in2_r, in3_r;
    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);

        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_r, in1_r, in2_r, in3_r);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST8x4_UB(in0, in1, dst + 8, 2 * dst_stride);
        dst += (4 * dst_stride);

        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                   in0_r, in1_r, in2_r, in3_r);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        ILVL_W2_SB(src5, src4, src7, src6, src0, src1);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST8x4_UB(in0, in1, dst + 8, 2 * dst_stride);
        dst += (4 * dst_stride);
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0_r, in1_r, in2_r, in3_r;
    v8i16 in0_l, in1_l, in2_l, in3_l;
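    /* generic copy for widths that are multiples of 16: the outer loop walks
       16-column tiles, the inner loop widens and stores 8 rows per pass */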
    for (cnt = (width >> 4); cnt--;) {
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD_SB8(src_tmp, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src_tmp += (8 * src_stride);

            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                       in0_r, in1_r, in2_r, in3_r);
            ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                       in0_l, in1_l, in2_l, in3_l);
            SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
            SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
            ST_SH4(in0_r, in1_r, in2_r, in3_r, dst_tmp, dst_stride);
            ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst_tmp + 8), dst_stride);
            dst_tmp += (4 * dst_stride);

            ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                       in0_r, in1_r, in2_r, in3_r);
            ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                       in0_l, in1_l, in2_l, in3_l);
            SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
            SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
            ST_SH4(in0_r, in1_r, in2_r, in3_r, dst_tmp, dst_stride);
            ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst_tmp + 8), dst_stride);
            dst_tmp += (4 * dst_stride);
                          int16_t *dst, int32_t dst_stride,
        v8i16 in0_r, in1_r, in2_r, in3_r;
        v8i16 in0_l, in1_l, in2_l, in3_l;

        LD_SB4(src, src_stride, src0, src1, src2, src3);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
    } else if (12 == height) {
        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
        v16i8 src8, src9, src10, src11;
        v8i16 in0_r, in1_r, in2_r, in3_r;
        v8i16 in0_l, in1_l, in2_l, in3_l;

        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        LD_SB4(src, src_stride, src8, src9, src10, src11);

        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
        dst += (4 * dst_stride);

        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
        dst += (4 * dst_stride);

        ILVR_B4_SH(zero, src8, zero, src9, zero, src10, zero, src11,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src8, zero, src9, zero, src10, zero, src11,
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
    } else if (0 == (height % 8)) {
                          int16_t *dst, int32_t dst_stride,

                          int16_t *dst, int32_t dst_stride,

                          int16_t *dst, int32_t dst_stride,

                          int16_t *dst, int32_t dst_stride,

                          int16_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
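    /* mask0 gathers adjacent byte pairs for taps 0-1 across the two
       interleaved 4-wide rows (bytes 16+ select from the second VSHF source);
       mask1..mask3 are presumably the same pattern advanced by 2, 4 and 6
       bytes for the remaining tap pairs */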
    const_vec = __msa_ldi_h(128);
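    /* the 128 bias pairs with the xori(src, 128) sign-flip applied to the
       unsigned source pixels before the signed byte dot products */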
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);

        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        ST8x8_UB(dst0, dst1, dst2, dst3, dst, 2 * dst_stride);
        dst += (8 * dst_stride);
                          int16_t *dst, int32_t dst_stride,
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    const_vec = __msa_ldi_h(128);
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
        dst += (4 * dst_stride);
                          int16_t *dst, int32_t dst_stride,

                          int16_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    const_vec = __msa_ldi_h(128);
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);
        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst4, dst4, dst4, dst4);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst5, dst5, dst5, dst5);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst6, dst6, dst6, dst6);
        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst7, dst7, dst7, dst7);

        ST_SH4(dst0, dst2, dst4, dst6, dst, dst_stride);
        ST_SH4(dst1, dst3, dst5, dst7, dst + 8, dst_stride);
        dst += (4 * dst_stride);
                          int16_t *dst, int32_t dst_stride,
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
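    /* masks 4-7 presumably continue the tap pattern past byte 8, picking
       pixels that straddle the src0/src1 register pair in the two-source
       VSHF below */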
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    const_vec = __msa_ldi_h(128);
    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src += src_stride;
        LD_SB2(src, 16, src2, src3);
        src += src_stride;

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);
        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst4, dst4, dst4, dst4);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst5, dst5, dst5, dst5);

        ST_SH2(dst0, dst1, dst, 8);
        ST_SH(dst2, dst + 16);
        dst += dst_stride;
        ST_SH2(dst3, dst4, dst, 8);
        ST_SH(dst5, dst + 16);
        dst += dst_stride;
                          int16_t *dst, int32_t dst_stride,
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    const_vec = __msa_ldi_h(128);
    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src2 = LD_SB(src + 24);
        src += src_stride;

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        ST_SH4(dst0, dst1, dst2, dst3, dst, 8);
        dst += dst_stride;
                          int16_t *dst, int32_t dst_stride,
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    const_vec = __msa_ldi_h(128);
    for (loop_cnt = height; loop_cnt--;) {
        LD_SB3(src, 16, src0, src1, src2);
        src3 = LD_SB(src + 40);
        src += src_stride;

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst4, dst4, dst4, dst4);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst5, dst5, dst5, dst5);

        ST_SH6(dst0, dst1, dst2, dst3, dst4, dst5, dst, 8);
        dst += dst_stride;
                          int16_t *dst, int32_t dst_stride,
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    const_vec = __msa_ldi_h(128);
    for (loop_cnt = height; loop_cnt--;) {
        LD_SB4(src, 16, src0, src1, src2, src3);
        src4 = LD_SB(src + 56);
        src += src_stride;

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        ST_SH(dst0, dst);

        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        ST_SH(dst1, dst + 8);

        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        ST_SH(dst2, dst + 16);

        VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);
        ST_SH(dst3, dst + 24);

        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst4, dst4, dst4, dst4);
        ST_SH(dst4, dst + 32);

        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst5, dst5, dst5, dst5);
        ST_SH(dst5, dst + 40);

        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst6, dst6, dst6, dst6);
        ST_SH(dst6, dst + 48);

        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst7, dst7, dst7, dst7);
        ST_SH(dst7, dst + 56);
        dst += dst_stride;
                          int16_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src9, src10, src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v16i8 src12111110, src14131312;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;
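    /* vertical 8-tap: start three rows above the first output row so the
       filter window is centered */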
    src -= (3 * src_stride);

    const_vec = __msa_ldi_h(128);
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);
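    /* the first seven rows seed the interleaved history; each iteration
       consumes eight new rows and re-seeds the window at the bottom */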
    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride,
               src7, src8, src9, src10, src11, src12, src13, src14);
        src += (8 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r,
                   src1211_r, src1110_r, src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);

        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
                     filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
                     filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
        DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
                     filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
                     filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);

        ST8x8_UB(dst10, dst32, dst54, dst76, dst, 2 * dst_stride);
        dst += (8 * dst_stride);

        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
                          int16_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filter_vec, const_vec;
    v8i16 filt0, filt1, filt2, filt3;

    src -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);

        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                     filt0, filt1, filt2, filt3,
                     dst0_r, dst0_r, dst0_r, dst0_r);
        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                     filt0, filt1, filt2, filt3,
                     dst1_r, dst1_r, dst1_r, dst1_r);
        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
                     filt0, filt1, filt2, filt3,
                     dst2_r, dst2_r, dst2_r, dst2_r);
        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
                     filt0, filt1, filt2, filt3,
                     dst3_r, dst3_r, dst3_r, dst3_r);

        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
        dst += (4 * dst_stride);
                          int16_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v8i16 dst0_l, dst1_l;
    v8i16 filter_vec, const_vec;
    v8i16 filt0, filt1, filt2, filt3;
    src -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_l, src32_l, src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
               src2110, src4332, src6554);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_l, src87_l, src98_l, src109_l);
        ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);

        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                     filt0, filt1, filt2, filt3,
                     dst0_r, dst0_r, dst0_r, dst0_r);
        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                     filt0, filt1, filt2, filt3,
                     dst1_r, dst1_r, dst1_r, dst1_r);
        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
                     filt0, filt1, filt2, filt3,
                     dst2_r, dst2_r, dst2_r, dst2_r);
        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
                     filt0, filt1, filt2, filt3,
                     dst3_r, dst3_r, dst3_r, dst3_r);
        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
                     filt0, filt1, filt2, filt3,
                     dst0_l, dst0_l, dst0_l, dst0_l);
        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
                     filt0, filt1, filt2, filt3,
                     dst1_l, dst1_l, dst1_l, dst1_l);

        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
        ST8x4_UB(dst0_l, dst1_l, dst + 8, 2 * dst_stride);
        dst += (4 * dst_stride);
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
    v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
    v8i16 filter_vec, const_vec;
    v8i16 filt0, filt1, filt2, filt3;
    src -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    for (cnt = width >> 4; cnt--;) {
        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);

        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_r, src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_l, src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
            src_tmp += (4 * src_stride);

            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                       src76_r, src87_r, src98_r, src109_r);
            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                       src76_l, src87_l, src98_l, src109_l);

            DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                         filt0, filt1, filt2, filt3,
                         dst0_r, dst0_r, dst0_r, dst0_r);
            DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                         filt0, filt1, filt2, filt3,
                         dst1_r, dst1_r, dst1_r, dst1_r);
            DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
                         filt0, filt1, filt2, filt3,
                         dst2_r, dst2_r, dst2_r, dst2_r);
            DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
                         filt0, filt1, filt2, filt3,
                         dst3_r, dst3_r, dst3_r, dst3_r);
            DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
                         filt0, filt1, filt2, filt3,
                         dst0_l, dst0_l, dst0_l, dst0_l);
            DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
                         filt0, filt1, filt2, filt3,
                         dst1_l, dst1_l, dst1_l, dst1_l);
            DPADD_SB4_SH(src32_l, src54_l, src76_l, src98_l,
                         filt0, filt1, filt2, filt3,
                         dst2_l, dst2_l, dst2_l, dst2_l);
            DPADD_SB4_SH(src43_l, src65_l, src87_l, src109_l,
                         filt0, filt1, filt2, filt3,
                         dst3_l, dst3_l, dst3_l, dst3_l);

            ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride);
            ST_SH4(dst0_l, dst1_l, dst2_l, dst3_l, dst_tmp + 8, dst_stride);
            dst_tmp += (4 * dst_stride);
                          int16_t *dst, int32_t dst_stride,

                               filter, height, 16);

                          int16_t *dst, int32_t dst_stride,

                               filter, height, 16);

                          int16_t *dst, int32_t dst_stride,

                               filter, height, 32);

                          int16_t *dst, int32_t dst_stride,

                               filter, height, 48);

                          int16_t *dst, int32_t dst_stride,

                               filter, height, 64);
                          int16_t *dst, int32_t dst_stride,
                          const int8_t *filter_x, const int8_t *filter_y,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1, filt2, filt3;
    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
    v4i32 dst0_r, dst1_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v16i8 mask0 = {
        0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
    };
    v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };
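    /* mask4 is a halfword shuffle used to re-pair the 4x4 horizontal
       results for the vertical taps (see the vshf_h in the loop below) */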
    src -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
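    /* sign-extend the 8-bit y taps to 16 bits (clti_s_b builds the sign
       mask), then splat them as four 32-bit tap pairs for the vertical
       multiply-accumulate */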
    const_vec = __msa_ldi_h(128);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);

    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                 dst30, dst30, dst30, dst30);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                 dst41, dst41, dst41, dst41);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                 dst52, dst52, dst52, dst52);
    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
                 dst63, dst63, dst63, dst63);

    ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
               dst10_r, dst21_r, dst32_r);
    dst43_r = __msa_ilvl_h(dst41, dst30);
    dst54_r = __msa_ilvl_h(dst52, dst41);
    dst65_r = __msa_ilvl_h(dst63, dst52);
    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
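    /* two output rows per iteration; dst66 keeps the newest column of
       horizontal results so the vertical window can slide by two rows */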
    for (loop_cnt = height >> 1; loop_cnt--;) {
        LD_SB2(src, src_stride, src7, src8);
        src += (2 * src_stride);

        VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst87, dst87, dst87, dst87);
        dst76_r = __msa_ilvr_h(dst87, dst66);
        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst87_r = __msa_vshf_h((v8i16) mask4, dst87, dst87);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);

        dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
        ST8x2_UB(dst0_r, dst, (2 * dst_stride));
        dst += (2 * dst_stride);

        dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
                                    const int8_t *filter_x,
                                    const int8_t *filter_y,
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1, filt2, filt3;
    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    src -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    const_vec = __msa_ldi_h(128);
    for (cnt = width >> 3; cnt--;) {
        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst4, dst4, dst4, dst4);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                     dst5, dst5, dst5, dst5);
        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                     dst6, dst6, dst6, dst6);

        ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
                   dst10_r, dst32_r, dst54_r, dst21_r);
        ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
        ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
                   dst10_l, dst32_l, dst54_l, dst21_l);
        ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
        for (loop_cnt = height >> 1; loop_cnt--;) {
            LD_SB2(src_tmp, src_stride, src7, src8);
            src_tmp += 2 * src_stride;

            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst7, dst7, dst7, dst7);

            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);

            dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
            ST_SW(dst0_r, dst_tmp);
            dst_tmp += dst_stride;

            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst8, dst8, dst8, dst8);

            dst0_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);

            dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
            ST_SW(dst0_r, dst_tmp);
            dst_tmp += dst_stride;
                          int16_t *dst, int32_t dst_stride,
                          const int8_t *filter_x, const int8_t *filter_y,

                               filter_x, filter_y, height, 8);

                          int16_t *dst, int32_t dst_stride,
                          const int8_t *filter_x, const int8_t *filter_y,

                               filter_x, filter_y, height, 8);
                          filter_x, filter_y, height);

                          int16_t *dst, int32_t dst_stride,
                          const int8_t *filter_x, const int8_t *filter_y,

                               filter_x, filter_y, height, 16);

                          int16_t *dst, int32_t dst_stride,
                          const int8_t *filter_x, const int8_t *filter_y,

                               filter_x, filter_y, height, 24);

                          int16_t *dst, int32_t dst_stride,
                          const int8_t *filter_x, const int8_t *filter_y,

                               filter_x, filter_y, height, 32);

                          int16_t *dst, int32_t dst_stride,
                          const int8_t *filter_x, const int8_t *filter_y,

                               filter_x, filter_y, height, 48);

                          int16_t *dst, int32_t dst_stride,
                          const int8_t *filter_x, const int8_t *filter_y,

                               filter_x, filter_y, height, 64);
    v16i8 mask1, vec0, vec1;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };

    filter_vec = LD_SH(filter);

    const_vec = __msa_ldi_h(128);

    LD_SB2(src, src_stride, src0, src1);
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    ST8x2_UB(dst0, dst, 2 * dst_stride);
    v16i8 mask1, vec0, vec1;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };

    filter_vec = LD_SH(filter);

    const_vec = __msa_ldi_h(128);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
    ST8x4_UB(dst0, dst1, dst, 2 * dst_stride);
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask1, vec0, vec1;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };

    filter_vec = LD_SH(filter);

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);

        VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
        ST8x8_UB(dst0, dst1, dst2, dst3, dst, 2 * dst_stride);
        dst += (8 * dst_stride);
    } else if (4 == height) {
    } else if (0 == height % 8) {
    uint64_t dst_val0, dst_val1, dst_val2, dst_val3;
    uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3;
    v8i16 filt0, filt1, dst0, dst1, dst2, dst3;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 filter_vec, const_vec;

    filter_vec = LD_SH(filter);

    const_vec = __msa_ldi_h(128);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
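        /* width 6: each output row goes out as one 8-byte store of the
           first four samples plus a 4-byte store of the last two */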
        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
        dst_val1 = __msa_copy_u_d((v2i64) dst1, 0);
        dst_val2 = __msa_copy_u_d((v2i64) dst2, 0);
        dst_val3 = __msa_copy_u_d((v2i64) dst3, 0);

        dst_val_int0 = __msa_copy_u_w((v4i32) dst0, 2);
        dst_val_int1 = __msa_copy_u_w((v4i32) dst1, 2);
        dst_val_int2 = __msa_copy_u_w((v4i32) dst2, 2);
        dst_val_int3 = __msa_copy_u_w((v4i32) dst3, 2);

        SD(dst_val0, dst);
        SW(dst_val_int0, dst + 4);
        dst += dst_stride;
        SD(dst_val1, dst);
        SW(dst_val_int1, dst + 4);
        dst += dst_stride;
        SD(dst_val2, dst);
        SW(dst_val_int2, dst + 4);
        dst += dst_stride;
        SD(dst_val3, dst);
        SW(dst_val_int3, dst + 4);
        dst += dst_stride;
    v8i16 filt0, filt1, dst0, dst1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 filter_vec, const_vec;

    filter_vec = LD_SH(filter);

    const_vec = __msa_ldi_h(128);
    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src1);
        src += (2 * src_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        ST_SH2(dst0, dst1, dst, dst_stride);
        dst += (2 * dst_stride);
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    filter_vec = LD_SH(filter);

    const_vec = __msa_ldi_h(128);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
        dst += (4 * dst_stride);
    if (2 == height || 6 == height) {
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask2 = {
        8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
    };

    filter_vec = LD_SH(filter);

    const_vec = __msa_ldi_h(128);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
        ST8x4_UB(dst4, dst5, dst + 8, 2 * dst_stride);
        dst += (4 * dst_stride);
    v16i8 src4, src5, src6, src7;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, const_vec;

    filter_vec = LD_SH(filter);

    const_vec = __msa_ldi_h(128);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);

        ST_SH4(dst0, dst2, dst4, dst6, dst, dst_stride);
        ST_SH4(dst1, dst3, dst5, dst7, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    int16_t *dst_tmp = dst + 16;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1, mask00, mask11;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    filter_vec = LD_SH(filter);

    mask11 = mask0 + 10;

    const_vec = __msa_ldi_h(128);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src0, src1, src0, src1, mask00, mask11, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src3, src2, src3, mask00, mask11, vec0, vec1);
        ST_SH2(dst0, dst1, dst, 8);
        dst += dst_stride;
        ST_SH2(dst2, dst3, dst, 8);
        dst += dst_stride;

        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src5, src4, src5, mask00, mask11, vec0, vec1);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src6, src7, src6, src7, mask00, mask11, vec0, vec1);
        ST_SH2(dst0, dst1, dst, 8);
        dst += dst_stride;
        ST_SH2(dst2, dst3, dst, 8);
        dst += dst_stride;

        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
        ST_SH4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1, mask2, mask3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    filter_vec = LD_SH(filter);

    const_vec = __msa_ldi_h(128);
    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src2 = LD_SB(src + 24);
        src += src_stride;

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        ST_SH4(dst0, dst1, dst2, dst3, dst, 8);
        dst += dst_stride;

        LD_SB2(src, 16, src0, src1);
        src2 = LD_SB(src + 24);
        src += src_stride;

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        ST_SH4(dst0, dst1, dst2, dst3, dst, 8);
        dst += dst_stride;
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src2110, src4332;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    filter_vec = LD_SH(filter);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);
    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    ST8x2_UB(dst10, dst, 2 * dst_stride);
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
    v16i8 src2110, src4332, src6554;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    filter_vec = LD_SH(filter);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);
    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);
    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
    ST8x4_UB(dst10, dst32, dst, 2 * dst_stride);
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src2110, src4332, src6554, src8776;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    filter_vec = LD_SH(filter);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
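    /* flip the packed rows to the signed range once here; the loop applies
       the same xori to each freshly packed history vector */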
    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);
        src += (6 * src_stride);

        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_r, src43_r, src54_r, src65_r);
        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
                   src4332, src6554, src8776);

        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
        DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
        DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);

        LD_SB2(src, src_stride, src9, src2);
        src += (2 * src_stride);
        ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
        src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
        DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);

        ST8x8_UB(dst10, dst32, dst54, dst76, dst, 2 * dst_stride);
        dst += (8 * dst_stride);
    } else if (4 == height) {
    } else if (0 == (height % 8)) {
    uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3;
    uint64_t dst_val0, dst_val1, dst_val2, dst_val3;
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    filter_vec = LD_SH(filter);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src3, src4);
        src += (2 * src_stride);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

        LD_SB2(src, src_stride, src1, src2);
        src += (2 * src_stride);

        ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
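        /* width 6: same split store as the horizontal case, 8 bytes plus
           4 bytes per row */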
        dst_val0 = __msa_copy_u_d((v2i64) dst0_r, 0);
        dst_val1 = __msa_copy_u_d((v2i64) dst1_r, 0);
        dst_val2 = __msa_copy_u_d((v2i64) dst2_r, 0);
        dst_val3 = __msa_copy_u_d((v2i64) dst3_r, 0);

        dst_val_int0 = __msa_copy_u_w((v4i32) dst0_r, 2);
        dst_val_int1 = __msa_copy_u_w((v4i32) dst1_r, 2);
        dst_val_int2 = __msa_copy_u_w((v4i32) dst2_r, 2);
        dst_val_int3 = __msa_copy_u_w((v4i32) dst3_r, 2);

        SD(dst_val0, dst);
        SW(dst_val_int0, dst + 4);
        dst += dst_stride;
        SD(dst_val1, dst);
        SW(dst_val_int1, dst + 4);
        dst += dst_stride;
        SD(dst_val2, dst);
        SW(dst_val_int2, dst + 4);
        dst += dst_stride;
        SD(dst_val3, dst);
        SW(dst_val_int3, dst + 4);
        dst += dst_stride;
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    filter_vec = LD_SH(filter);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    LD_SB2(src, src_stride, src3, src4);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    filter_vec = LD_SH(filter);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    LD_SB2(src, src_stride, src3, src4);
    src += (2 * src_stride);

    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
    dst += (2 * dst_stride);

    LD_SB2(src, src_stride, src1, src2);
    src += (2 * src_stride);

    ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
    DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
    DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);

    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
    dst += (2 * dst_stride);

    LD_SB2(src, src_stride, src3, src4);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    filter_vec = LD_SH(filter);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src3, src4);
        src += (2 * src_stride);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);

        LD_SB2(src, src_stride, src5, src2);
        src += (2 * src_stride);

        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);

        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);
    } else if (6 == height) {
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
    v16i8 src2110, src4332;
    v8i16 dst0_l, dst1_l;
    v8i16 filter_vec, const_vec;

    src -= (1 * src_stride);
    const_vec = __msa_ldi_h(128);
    filter_vec = LD_SH(filter);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
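    /* width 12: columns 0-7 are filtered as full vectors; the interleaved
       left halves pack two rows of columns 8-11 into one register, filtered
       as dst0_l/dst1_l and stored at dst + 8 */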
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src3, src4);
        src += (2 * src_stride);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l);

        LD_SB2(src, src_stride, src5, src2);
        src += (2 * src_stride);

        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
        DPADD_SB2_SH(src4332, src2110, filt0, filt1, dst1_l, dst1_l);

        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
        ST8x4_UB(dst0_l, dst1_l, dst + 8, (2 * dst_stride));
        dst += (4 * dst_stride);
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v8i16 dst0_r, dst1_r, dst0_l, dst1_l;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    filter_vec = LD_SH(filter);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src3, src4);
        src += (2 * src_stride);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
        ST_SH2(dst0_r, dst0_l, dst, 8);
        dst += dst_stride;
        ST_SH2(dst1_r, dst1_l, dst, 8);
        dst += dst_stride;

        LD_SB2(src, src_stride, src5, src2);
        src += (2 * src_stride);

        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
        ST_SH2(dst0_r, dst0_l, dst, 8);
        dst += dst_stride;
        ST_SH2(dst1_r, dst1_l, dst, 8);
        dst += dst_stride;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10, src11;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v8i16 dst0_l, dst1_l;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    filter_vec = LD_SH(filter);

    LD_SB3(src, src_stride, src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    LD_SB3(src + 16, src_stride, src6, src7, src8);
    src += (3 * src_stride);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
2719 for (loop_cnt = (height >> 2); loop_cnt--;) {
2720 LD_SB2(src, src_stride, src3, src4);
2722 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2723 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2725 LD_SB2(src + 16, src_stride, src9, src10);
2726 src += (2 * src_stride);
2728 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
2731 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2733 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
2735 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
2737 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
2739 DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
2741 DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
2743 ST_SH2(dst0_r, dst0_l, dst, 8);
2744 ST_SH(dst2_r, dst + 16);
2746 ST_SH2(dst1_r, dst1_l, dst, 8);
2747 ST_SH(dst3_r, dst + 16);
2750 LD_SB2(src, src_stride, src5, src2);
2752 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
2753 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
2755 LD_SB2(src + 16, src_stride, src11, src8);
2756 src += (2 * src_stride);
2758 ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
2761 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
2763 DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
2765 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
2767 DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
2769 DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
2771 DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
2773 ST_SH2(dst0_r, dst0_l, dst, 8);
2774 ST_SH(dst2_r, dst + 16);
2776 ST_SH2(dst1_r, dst1_l, dst, 8);
2777 ST_SH(dst3_r, dst + 16);
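
/* Vertical 4-tap filter, width 32: two independent 16-column pipelines,
 * each keeping right and left interleaved context. */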
static void hevc_vt_4t_32w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10, src11;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src87_l, src109_l;
    v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src -= src_stride;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* first 16 columns */
    LD_SB3(src, src_stride, src0, src1, src2);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    /* next 16 columns */
    LD_SB3(src + 16, src_stride, src6, src7, src8);
    src += (3 * src_stride);
    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src3, src4);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        LD_SB2(src + 16, src_stride, src9, src10);
        src += (2 * src_stride);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);

        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
        dst2_r = const_vec;
        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
        dst2_l = const_vec;
        DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l);
        dst3_r = const_vec;
        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
        dst3_l = const_vec;
        DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l);

        ST_SH4(dst0_r, dst0_l, dst2_r, dst2_l, dst, 8);
        dst += dst_stride;
        ST_SH4(dst1_r, dst1_l, dst3_r, dst3_l, dst, 8);
        dst += dst_stride;

        LD_SB2(src, src_stride, src5, src2);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);

        LD_SB2(src + 16, src_stride, src11, src8);
        src += (2 * src_stride);
        XORI_B2_128_SB(src11, src8);
        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
        ILVL_B2_SB(src11, src10, src8, src11, src76_l, src87_l);

        dst0_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
        dst2_r = const_vec;
        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
        dst2_l = const_vec;
        DPADD_SB2_SH(src98_l, src76_l, filt0, filt1, dst2_l, dst2_l);
        dst3_r = const_vec;
        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
        dst3_l = const_vec;
        DPADD_SB2_SH(src109_l, src87_l, filt0, filt1, dst3_l, dst3_l);

        ST_SH4(dst0_r, dst0_l, dst2_r, dst2_l, dst, 8);
        dst += dst_stride;
        ST_SH4(dst1_r, dst1_l, dst3_r, dst3_l, dst, 8);
        dst += dst_stride;
    }
}
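
/* Horizontal+vertical 4-tap filter, 4x2 block: the horizontal pass
 * (VSHF_B2_SB + DPADD_SB2_SH) produces 16-bit rows, which the vertical
 * pass combines with HEVC_FILT_4TAP and a >> 6 shift. */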
static void hevc_hv_4t_4x2_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst1_r;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);

    LD_SB2(src, src_stride, src3, src4);
    XORI_B2_128_SB(src3, src4);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    dst3 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
    dst32_r = __msa_ilvr_h(dst3, dst2);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_r >>= 6;

    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    dst4 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
    dst43_r = __msa_ilvr_h(dst4, dst3);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_r >>= 6;

    dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
    ST8x2_UB(dst0_r, dst, 2 * dst_stride);
}
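
/* Same two-pass scheme for a 4x4 block: seven filtered input rows
 * (three context plus four new) yield four vertical outputs. */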
static void hevc_hv_4t_4x4_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);

    LD_SB4(src, src_stride, src3, src4, src5, src6);
    XORI_B4_128_SB(src3, src4, src5, src6);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    dst3 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
    dst32_r = __msa_ilvr_h(dst3, dst2);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_r >>= 6;

    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    dst4 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
    dst43_r = __msa_ilvr_h(dst4, dst3);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_r >>= 6;

    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
    dst5 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
    dst10_r = __msa_ilvr_h(dst5, dst4);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
    dst2_r >>= 6;

    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
    dst21_r = __msa_ilvr_h(dst2, dst5);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
    dst3_r >>= 6;

    PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
    ST8x4_UB(dst0_r, dst1_r, dst, 2 * dst_stride);
}
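
/* Width 4, height a multiple of 8: the interleaved horizontal context
 * (dst10_r/dst21_r) is carried across iterations of eight rows each. */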
static void hevc_hv_4t_4multx8mult_msa(uint8_t *src, int32_t src_stride,
                                       int16_t *dst, int32_t dst_stride,
                                       const int8_t *filter_x,
                                       const int8_t *filter_y, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src7, src8, src9, src10;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);
        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
        dst32_r = __msa_ilvr_h(dst3, dst2);
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_r >>= 6;

        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        dst4 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
        dst43_r = __msa_ilvr_h(dst4, dst3);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_r >>= 6;

        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        dst5 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
        dst54_r = __msa_ilvr_h(dst5, dst4);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_r >>= 6;

        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        dst6 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
        dst65_r = __msa_ilvr_h(dst6, dst5);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_r >>= 6;

        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
        dst7 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
        dst76_r = __msa_ilvr_h(dst7, dst6);
        dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
        dst4_r >>= 6;

        VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
        dst8 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
        dst87_r = __msa_ilvr_h(dst8, dst7);
        dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
        dst5_r >>= 6;

        VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
        dst9 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9);
        dst10_r = __msa_ilvr_h(dst9, dst8);
        dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1);
        dst6_r >>= 6;

        VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        dst21_r = __msa_ilvr_h(dst2, dst9);
        dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1);
        dst7_r >>= 6;

        PCKEV_H4_SW(dst1_r, dst0_r, dst3_r, dst2_r,
                    dst5_r, dst4_r, dst7_r, dst6_r,
                    dst0_r, dst1_r, dst2_r, dst3_r);
        ST8x8_UB(dst0_r, dst1_r, dst2_r, dst3_r, dst, 2 * dst_stride);
        dst += (8 * dst_stride);
    }
}
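
/* Width-4 hv dispatcher: picks the specialized 4x2, 4x4, or
 * 4xN (N % 8 == 0) kernel. */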
static void hevc_hv_4t_4w_msa(uint8_t *src, int32_t src_stride,
                              int16_t *dst, int32_t dst_stride,
                              const int8_t *filter_x, const int8_t *filter_y,
                              int32_t height)
{
    if (2 == height) {
        hevc_hv_4t_4x2_msa(src, src_stride, dst, dst_stride,
                           filter_x, filter_y);
    } else if (4 == height) {
        hevc_hv_4t_4x4_msa(src, src_stride, dst, dst_stride,
                           filter_x, filter_y);
    } else if (0 == (height % 8)) {
        hevc_hv_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height);
    }
}
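
/* Width 6: computes a full 8-wide hv result, then stores 12 bytes per row
 * as one double word (SD) plus one word (SW) at dst + 4. */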
static void hevc_hv_4t_6w_msa(uint8_t *src, int32_t src_stride,
                              int16_t *dst, int32_t dst_stride,
                              const int8_t *filter_x, const int8_t *filter_y,
                              int32_t height)
{
    uint32_t loop_cnt;
    uint64_t dst_val0, dst_val1, dst_val2, dst_val3;
    uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

    for (loop_cnt = height >> 2; loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);
        XORI_B4_128_SB(src3, src4, src5, src6);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst0_r >>= 6;
        dst0_l >>= 6;

        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        dst4 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst1_r >>= 6;
        dst1_l >>= 6;

        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        dst5 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
        ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
        dst2_r >>= 6;
        dst2_l >>= 6;

        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
        dst3_r >>= 6;
        dst3_l >>= 6;

        PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
                    dst2_l, dst2_r, dst3_l, dst3_r,
                    dst0_r, dst1_r, dst2_r, dst3_r);

        dst_val0 = __msa_copy_u_d((v2i64) dst0_r, 0);
        dst_val1 = __msa_copy_u_d((v2i64) dst1_r, 0);
        dst_val2 = __msa_copy_u_d((v2i64) dst2_r, 0);
        dst_val3 = __msa_copy_u_d((v2i64) dst3_r, 0);

        dst_val_int0 = __msa_copy_u_w((v4i32) dst0_r, 2);
        dst_val_int1 = __msa_copy_u_w((v4i32) dst1_r, 2);
        dst_val_int2 = __msa_copy_u_w((v4i32) dst2_r, 2);
        dst_val_int3 = __msa_copy_u_w((v4i32) dst3_r, 2);

        SD(dst_val0, dst);
        SW(dst_val_int0, dst + 4);
        dst += dst_stride;
        SD(dst_val1, dst);
        SW(dst_val_int1, dst + 4);
        dst += dst_stride;
        SD(dst_val2, dst);
        SW(dst_val_int2, dst + 4);
        dst += dst_stride;
        SD(dst_val3, dst);
        SW(dst_val_int3, dst + 4);
        dst += dst_stride;
    }
}
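
/* Width 8, height 2: both right and left interleaved halves feed
 * HEVC_FILT_4TAP; results are packed and stored as two 8-short rows. */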
static void hevc_hv_4t_8x2_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

    LD_SB2(src, src_stride, src3, src4);
    src += (2 * src_stride);
    XORI_B2_128_SB(src3, src4);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    dst3 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst0_r >>= 6;
    dst0_l >>= 6;

    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    dst4 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    dst1_r >>= 6;
    dst1_l >>= 6;

    PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
    ST_SW2(dst0_r, dst1_r, dst, dst_stride);
}
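
/* Width 8, height 6: fully unrolled; nine input rows (three context plus
 * six new) produce six output rows, packed and stored in pairs. */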
static void hevc_hv_4t_8x6_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

    LD_SB2(src, src_stride, src3, src4);
    src += (2 * src_stride);
    XORI_B2_128_SB(src3, src4);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    dst3 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst0_r >>= 6;
    dst0_l >>= 6;

    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    dst4 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    dst1_r >>= 6;
    dst1_l >>= 6;

    LD_SB2(src, src_stride, src5, src6);
    src += (2 * src_stride);
    XORI_B2_128_SB(src5, src6);

    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
    dst5 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
    dst2_r >>= 6;
    dst2_l >>= 6;

    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
    dst6 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
    dst3_r >>= 6;
    dst3_l >>= 6;

    LD_SB2(src, src_stride, src7, src8);
    XORI_B2_128_SB(src7, src8);

    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
    dst7 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
    dst4_r >>= 6;
    dst4_l >>= 6;

    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
    dst8 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
    dst5_r >>= 6;
    dst5_l >>= 6;

    PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
                dst2_l, dst2_r, dst3_l, dst3_r,
                dst0_r, dst1_r, dst2_r, dst3_r);
    PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r, dst5_r);

    ST_SW2(dst0_r, dst1_r, dst, dst_stride);
    dst += (2 * dst_stride);
    ST_SW2(dst2_r, dst3_r, dst, dst_stride);
    dst += (2 * dst_stride);
    ST_SW2(dst4_r, dst5_r, dst, dst_stride);
}
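
/* Generic hv kernel for widths that are multiples of 8: the outer loop
 * steps 8 columns at a time, the inner loop emits 4 rows per iteration. */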
static void hevc_hv_4t_8multx4mult_msa(uint8_t *src, int32_t src_stride,
                                       int16_t *dst, int32_t dst_stride,
                                       const int8_t *filter_x,
                                       const int8_t *filter_y,
                                       int32_t height, int32_t width)
{
    uint32_t loop_cnt, cnt;
    uint8_t *src_tmp;
    int16_t *dst_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (cnt = width >> 3; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB3(src_tmp, src_stride, src0, src1, src2);
        src_tmp += (3 * src_stride);
        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        dst1 = const_vec;
        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

        for (loop_cnt = height >> 2; loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
            src_tmp += (4 * src_stride);
            XORI_B4_128_SB(src3, src4, src5, src6);

            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
            dst3 = const_vec;
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
            dst0_r >>= 6;
            dst0_l >>= 6;

            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
            dst4 = const_vec;
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
            dst1_r >>= 6;
            dst1_l >>= 6;

            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
            dst5 = const_vec;
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
            ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
            dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
            dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
            dst2_r >>= 6;
            dst2_l >>= 6;

            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
            dst2 = const_vec;
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
            ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
            dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
            dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
            dst3_r >>= 6;
            dst3_l >>= 6;

            PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
                        dst2_l, dst2_r, dst3_l, dst3_r,
                        dst0_r, dst1_r, dst2_r, dst3_r);

            ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);
            ST_SW2(dst2_r, dst3_r, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);
        }

        src += 8;
        dst += 8;
    }
}
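
/* Width-8 hv dispatcher: 8x2 and 8x6 get unrolled kernels; other heights
 * (multiples of 4) use the generic 8-column kernel. */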
static void hevc_hv_4t_8w_msa(uint8_t *src, int32_t src_stride,
                              int16_t *dst, int32_t dst_stride,
                              const int8_t *filter_x, const int8_t *filter_y,
                              int32_t height)
{
    if (2 == height) {
        hevc_hv_4t_8x2_msa(src, src_stride, dst, dst_stride,
                           filter_x, filter_y, height);
    } else if (6 == height) {
        hevc_hv_4t_8x6_msa(src, src_stride, dst, dst_stride,
                           filter_x, filter_y, height);
    } else if (0 == (height % 4)) {
        hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 8);
    }
}
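
/* The remaining hv widths are composed from the 8-column kernel, with the
 * 12-wide case adding the 4-column kernel for its trailing columns. */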
static void hevc_hv_4t_12w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 8);
    hevc_hv_4t_4multx8mult_msa(src + 8, src_stride, dst + 8, dst_stride,
                               filter_x, filter_y, height);
}

static void hevc_hv_4t_16w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 16);
}

static void hevc_hv_4t_24w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 24);
}

static void hevc_hv_4t_32w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 32);
}
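
/* Glue macros: map the ff_hevc_put_hevc_* entry points onto the
 * width-specialized kernels above. MAX_PB_SIZE is the stride of the 16-bit
 * intermediate buffer the decoder passes between stages. */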
#define MC_COPY(WIDTH)                                                    \
void ff_hevc_put_hevc_pel_pixels##WIDTH##_8_msa(int16_t *dst,             \
                                                uint8_t *src,             \
                                                ptrdiff_t src_stride,     \
                                                int height,               \
                                                intptr_t mx,              \
                                                intptr_t my,              \
                                                int width)                \
{                                                                         \
    hevc_copy_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE, height);  \
}

MC_COPY(4);
MC_COPY(6);
MC_COPY(8);
MC_COPY(12);
MC_COPY(16);
MC_COPY(24);
MC_COPY(32);
MC_COPY(48);
MC_COPY(64);
#define MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                          \
void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_msa(int16_t *dst,          \
                                                   uint8_t *src,          \
                                                   ptrdiff_t src_stride,  \
                                                   int height,            \
                                                   intptr_t mx,           \
                                                   intptr_t my,           \
                                                   int width)             \
{                                                                         \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];         \
                                                                          \
    hevc_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,           \
                                          MAX_PB_SIZE, filter, height);   \
}
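
/* Instantiate the uni-directional entry points: 8-tap qpel and 4-tap epel,
 * horizontal (hz, indexed by mx) and vertical (vt, indexed by my). */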
MC(qpel, h, 4, 8, hz, mx);
MC(qpel, h, 8, 8, hz, mx);
MC(qpel, h, 12, 8, hz, mx);
MC(qpel, h, 16, 8, hz, mx);
MC(qpel, h, 24, 8, hz, mx);
MC(qpel, h, 32, 8, hz, mx);
MC(qpel, h, 48, 8, hz, mx);
MC(qpel, h, 64, 8, hz, mx);

MC(qpel, v, 4, 8, vt, my);
MC(qpel, v, 8, 8, vt, my);
MC(qpel, v, 12, 8, vt, my);
MC(qpel, v, 16, 8, vt, my);
MC(qpel, v, 24, 8, vt, my);
MC(qpel, v, 32, 8, vt, my);
MC(qpel, v, 48, 8, vt, my);
MC(qpel, v, 64, 8, vt, my);
MC(epel, h, 4, 4, hz, mx);
MC(epel, h, 6, 4, hz, mx);
MC(epel, h, 8, 4, hz, mx);
MC(epel, h, 12, 4, hz, mx);
MC(epel, h, 16, 4, hz, mx);
MC(epel, h, 24, 4, hz, mx);
MC(epel, h, 32, 4, hz, mx);

MC(epel, v, 4, 4, vt, my);
MC(epel, v, 6, 4, vt, my);
MC(epel, v, 8, 4, vt, my);
MC(epel, v, 12, 4, vt, my);
MC(epel, v, 16, 4, vt, my);
MC(epel, v, 24, 4, vt, my);
MC(epel, v, 32, 4, vt, my);
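
/* Two-dimensional (hv) entry points: mx selects filter_x and my selects
 * filter_y, both from the same per-PEL filter table. */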
#define MC_HV(PEL, DIR, WIDTH, TAP, DIR1)                                 \
void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_msa(int16_t *dst,          \
                                                   uint8_t *src,          \
                                                   ptrdiff_t src_stride,  \
                                                   int height,            \
                                                   intptr_t mx,           \
                                                   intptr_t my,           \
                                                   int width)             \
{                                                                         \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];             \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];             \
                                                                          \
    hevc_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,           \
                                          MAX_PB_SIZE,                    \
                                          filter_x, filter_y, height);    \
}
MC_HV(qpel, hv, 4, 8, hv);
MC_HV(qpel, hv, 8, 8, hv);
MC_HV(qpel, hv, 12, 8, hv);
MC_HV(qpel, hv, 16, 8, hv);
MC_HV(qpel, hv, 24, 8, hv);
MC_HV(qpel, hv, 32, 8, hv);
MC_HV(qpel, hv, 48, 8, hv);
MC_HV(qpel, hv, 64, 8, hv);

MC_HV(epel, hv, 4, 4, hv);
MC_HV(epel, hv, 6, 4, hv);
MC_HV(epel, hv, 8, 4, hv);
MC_HV(epel, hv, 12, 4, hv);
MC_HV(epel, hv, 16, 4, hv);
MC_HV(epel, hv, 24, 4, hv);
MC_HV(epel, hv, 32, 4, hv);