src += (4 * src_stride);
LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
ref += (4 * ref_stride);
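/* per-row SAD: absolute byte differences, then widen adjacent byte pairs into halfword accumulators */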
diff = __msa_asub_u_b((v16u8) src0, (v16u8) ref0);
sad += __msa_hadd_u_h((v16u8) diff, (v16u8) diff);
src += (2 * src_stride);
ref += (2 * ref_stride);
src += (2 * src_stride);
ref += (2 * ref_stride);
for (; res > 0; res--) {
diff = __msa_asub_u_b((v16u8) src0, (v16u8) ref0);
sad += __msa_hadd_u_h((v16u8) diff, (v16u8) diff);
v16u8 ref0, ref1, ref2, ref3, ref4, ref5;
src += (4 * src_stride);
LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
ref += (4 * ref_stride);
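/* shift each ref row left by one byte so ref and ref + 1 pixels can be averaged into the horizontal half-pel prediction */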
SLDI_B4_UB(ref0, ref0, ref1, ref1, ref2, ref2, ref3, ref3, 1,
           ref0, ref1, ref2, ref3);
src += (4 * src_stride);
LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
ref += (4 * ref_stride);
SLDI_B4_UB(ref0, ref0, ref1, ref1, ref2, ref2, ref3, ref3, 1,
           ref0, ref1, ref2, ref3);
comp0 = (v16u8) __msa_aver_u_b((v16u8) ref0, (v16u8) ref1);
diff = __msa_asub_u_b((v16u8) src0, (v16u8) comp0);
sad += __msa_hadd_u_h((v16u8) diff, (v16u8) diff);
v16u8 ref00, ref10, ref20, ref30, ref01, ref11, ref21, ref31;
src += (4 * src_stride);
LD_UB4(ref, ref_stride, ref00, ref10, ref20, ref30);
LD_UB4(ref + 1, ref_stride, ref01, ref11, ref21, ref31);
ref += (4 * ref_stride);
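/* average each ref row with the row loaded at ref + 1 to form the horizontal half-pel prediction */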
AVER_UB2_UB(ref01, ref00, ref11, ref10, comp0, comp1);
AVER_UB2_UB(ref21, ref20, ref31, ref30, comp0, comp1);
src += (4 * src_stride);
LD_UB4(ref, ref_stride, ref00, ref10, ref20, ref30);
LD_UB4(ref + 1, ref_stride, ref01, ref11, ref21, ref31);
ref += (4 * ref_stride);
AVER_UB2_UB(ref01, ref00, ref11, ref10, comp0, comp1);
AVER_UB2_UB(ref21, ref20, ref31, ref30, comp0, comp1);
comp0 = (v16u8) __msa_aver_u_b((v16u8) ref00, (v16u8) ref01);
diff = __msa_asub_u_b((v16u8) src0, (v16u8) comp0);
sad += __msa_hadd_u_h((v16u8) diff, (v16u8) diff);
v16u8 ref0, ref1, ref2, ref3, ref4;
src += (4 * src_stride);
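/* load five consecutive ref rows so vertically adjacent rows can be averaged into the vertical half-pel prediction */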
LD_UB5(ref, ref_stride, ref0, ref1, ref2, ref3, ref4);
ref += (4 * ref_stride);
src += (4 * src_stride);
LD_UB5(ref, ref_stride, ref0, ref1, ref2, ref3, ref4);
ref += (4 * ref_stride);
comp0 = (v16u8) __msa_aver_u_b((v16u8) ref0, (v16u8) ref1);
diff = __msa_asub_u_b((v16u8) src0, (v16u8) comp0);
sad += __msa_hadd_u_h((v16u8) diff, (v16u8) diff);
v16u8 ref0, ref1, ref2, ref3, ref4;
LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3);
ref += (5 * ref_stride);
src += (4 * src_stride);
LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
ref += (3 * ref_stride);
src += (4 * src_stride);
comp0 = (v16u8) __msa_aver_u_b((v16u8) ref0, (v16u8) ref1);
diff = __msa_asub_u_b((v16u8) src0, (v16u8) comp0);
sad += __msa_hadd_u_h((v16u8) diff, (v16u8) diff);
v16u8 ref0, ref1, ref2, ref3, ref4;
v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
v8u16 comp0, comp1, comp2, comp3;
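/* mask duplicates horizontally adjacent bytes, so __msa_vshf_b + __msa_hadd_u_h gives per-pixel horizontal pair sums;
 * pair sums from two vertically adjacent rows are combined and rounded by 2: the (a + b + c + d + 2) >> 2 half-pel average */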
for (ht_cnt = (height >> 2); ht_cnt--;) {
LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3);
ref += (4 * ref_stride);
src += (4 * src_stride);
comp0 = __msa_hadd_u_h(temp0, temp0);
comp1 = __msa_hadd_u_h(temp1, temp1);
comp0 = (v8u16) __msa_srari_h((v8i16) comp0, 2);
comp0 = (v8u16) __msa_pckev_b((v16i8) comp0, (v16i8) comp0);
temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref1, (v16i8) ref1);
comp2 = __msa_hadd_u_h(temp0, temp0);
comp1 = (v8u16) __msa_srari_h((v8i16) comp1, 2);
comp1 = (v8u16) __msa_pckev_b((v16i8) comp1, (v16i8) comp1);
comp1 = (v8u16) __msa_pckev_d((v2i64) comp1, (v2i64) comp0);
diff = (v16u8) __msa_asub_u_b(src0, (v16u8) comp1);
temp1 = (v16u8) __msa_vshf_b(mask, (v16i8) ref2, (v16i8) ref2);
comp3 = __msa_hadd_u_h(temp1, temp1);
comp2 = (v8u16) __msa_srari_h((v8i16) comp2, 2);
comp2 = (v8u16) __msa_pckev_b((v16i8) comp2, (v16i8) comp2);
temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref3, (v16i8) ref3);
comp0 = __msa_hadd_u_h(temp0, temp0);
comp3 = (v8u16) __msa_srari_h((v8i16) comp3, 2);
comp3 = (v8u16) __msa_pckev_b((v16i8) comp3, (v16i8) comp3);
comp3 = (v8u16) __msa_pckev_d((v2i64) comp3, (v2i64) comp2);
diff = (v16u8) __msa_asub_u_b(src1, (v16u8) comp3);
temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref0, (v16i8) ref0);
temp1 = (v16u8) __msa_vshf_b(mask, (v16i8) ref1, (v16i8) ref1);
comp0 = __msa_hadd_u_h(temp0, temp0);
comp2 = __msa_hadd_u_h(temp1, temp1);
comp2 = (v8u16) __msa_srari_h((v8i16) comp2, 2);
comp0 = (v16u8) __msa_pckev_b((v16i8) zero, (v16i8) comp2);
v16u8 temp0, temp1, temp2, temp3;
v16u8 ref00, ref01, ref02, ref03, ref04, ref10, ref11, ref12, ref13, ref14;
v8u16 comp0, comp1, comp2, comp3;
src += (4 * src_stride);
LD_UB5(ref, ref_stride, ref04, ref00, ref01, ref02, ref03);
LD_UB5(ref + 1, ref_stride, ref14, ref10, ref11, ref12, ref13);
ref += (5 * ref_stride);
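/* horizontal pair sums from the ref and ref + 1 rows are added, rounded by 2 and packed back to bytes, giving the hv half-pel prediction used for the SAD */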
comp0 = __msa_hadd_u_h(temp0, temp0);
comp1 = __msa_hadd_u_h(temp1, temp1);
comp2 = __msa_hadd_u_h(temp2, temp2);
comp3 = __msa_hadd_u_h(temp3, temp3);
comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
comp0 = __msa_hadd_u_h(temp0, temp0);
comp1 = __msa_hadd_u_h(temp1, temp1);
comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
comp2 = __msa_hadd_u_h(temp2, temp2);
comp3 = __msa_hadd_u_h(temp3, temp3);
comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
comp0 = __msa_hadd_u_h(temp0, temp0);
comp1 = __msa_hadd_u_h(temp1, temp1);
comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
src += (4 * src_stride);
LD_UB4(ref, ref_stride, ref00, ref01, ref02, ref03);
LD_UB4(ref + 1, ref_stride, ref10, ref11, ref12, ref13);
ref += (3 * ref_stride);
comp2 = __msa_hadd_u_h(temp2, temp2);
comp3 = __msa_hadd_u_h(temp3, temp3);
comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
comp0 = __msa_hadd_u_h(temp0, temp0);
comp1 = __msa_hadd_u_h(temp1, temp1);
comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
comp2 = __msa_hadd_u_h(temp2, temp2);
comp3 = __msa_hadd_u_h(temp3, temp3);
comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
comp0 = __msa_hadd_u_h(temp0, temp0);
comp1 = __msa_hadd_u_h(temp1, temp1);
comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
LD_UB2(ref + 1, ref_stride, ref01, ref11);
comp0 = __msa_hadd_u_h(temp0, temp0);
comp1 = __msa_hadd_u_h(temp1, temp1);
comp2 = __msa_hadd_u_h(temp2, temp2);
comp3 = __msa_hadd_u_h(temp3, temp3);
comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
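/* CALC_MSE_B: interleave src and ref bytes, take the per-pair differences, then square and accumulate them into the 32-bit lanes of var */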
#define CALC_MSE_B(src, ref, var) \
v16u8 src_l0_m, src_l1_m; \
v8i16 res_l0_m, res_l1_m; \
ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
const uint8_t *ref_ptr, int32_t ref_stride,
uint32_t ref0, ref1, ref2, ref3;
src_ptr += (4 * src_stride);
LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
ref_ptr += (4 * ref_stride);
src_ptr += src_stride;
ref_ptr += ref_stride;
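/* leftover rows: insert a 4-pixel word into lane 0, interleave src with ref, then difference and square-accumulate into var */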
src = (v16u8) __msa_insert_w((v4i32) src, 0, src0);
ref = (v16u8) __msa_insert_w((v4i32) ref, 0, ref0);
reg0 = (v16u8) __msa_ilvr_b((v16i8) src, (v16i8) ref);
reg0 = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) reg0);
tmp0 = (v8i16) __msa_hsub_u_h((v16u8) reg0, (v16u8) reg0);
var = (v4i32) __msa_dpadd_s_w((v4i32) var, (v8i16) tmp0, (v8i16) tmp0);
const uint8_t *ref_ptr, int32_t ref_stride,
v16u8 ref0, ref1, ref2, ref3;
src_ptr += (4 * src_stride);
LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
ref_ptr += (4 * ref_stride);
ref0 = LD_UB(ref_ptr);
src_ptr += src_stride;
ref_ptr += ref_stride;
ref1 = (v16u8) __msa_ilvr_b((v16i8) src0, (v16i8) ref0);
tmp0 = (v8i16) __msa_hsub_u_h((v16u8) ref1, (v16u8) ref1);
var = (v4i32) __msa_dpadd_s_w((v4i32) var, (v8i16) tmp0, (v8i16) tmp0);
const uint8_t *ref_ptr, int32_t ref_stride,
src_ptr += src_stride;
ref_ptr += ref_stride;
src_ptr += src_stride;
ref_ptr += ref_stride;
src_ptr += src_stride;
ref_ptr += ref_stride;
src_ptr += src_stride;
ref_ptr += ref_stride;
src_ptr += src_stride;
ref_ptr += ref_stride;
v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
src4, ref4, src5, ref5, src6, ref6, src7, ref7,
diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
HSUB_UB4_UH(diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3);
HSUB_UB4_UH(diff4, diff5, diff6, diff7, diff4, diff5, diff6, diff7);
diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
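/* 8x8 Hadamard of the src-ref differences: butterfly stages interleaved with transposes; the SATD is the sum of absolute transform values */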
BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1,
            temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1);
BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2,
            diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2);
BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4,
            temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4);
temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1,
            diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1);
BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2,
            temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2);
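/* last butterfly stage is folded into the accumulation: |a + b| and |a - b| of each remaining pair are summed directly */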
ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7,
     diff0, diff1, diff2, diff3);
sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7);
sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6);
sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5);
sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4);
sum += __msa_add_a_h((v8i16) diff0, zero);
sum += __msa_add_a_h((v8i16) diff1, zero);
sum += __msa_add_a_h((v8i16) diff2, zero);
sum += __msa_add_a_h((v8i16) diff3, zero);
const uint8_t *dummy, int32_t ref_stride)
v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1,
            temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1);
BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2,
            diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2);
BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4,
            temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4);
temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1,
            diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1);
BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2,
            temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2);
ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7,
     diff0, diff1, diff2, diff3);
sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7);
sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6);
sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5);
sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4);
sum += __msa_add_a_h((v8i16) diff0, (v8i16) zero);
sum += __msa_add_a_h((v8i16) diff1, (v8i16) zero);
sum += __msa_add_a_h((v8i16) diff2, (v8i16) zero);
sum += __msa_add_a_h((v8i16) diff3, (v8i16) zero);
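/* drop the first (DC) coefficient from the intra score */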
sum_res -= abs(temp0[0] + temp4[0]);
#define WRAPPER8_16_SQ(name8, name16) \
int name16(MpegEncContext *s, const uint8_t *dst, const uint8_t *src, \
           ptrdiff_t stride, int h) \
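/* the 16x16 score is the sum of four 8x8 scores */ \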
score += name8(s, dst, src, stride, 8); \
score += name8(s, dst + 8, src + 8, stride, 8); \
score += name8(s, dst, src, stride, 8); \
score += name8(s, dst + 8, src + 8, stride, 8); \