27     0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
 
   28     0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
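/* Byte-shuffle masks for VSHF_B4_SB: the first row pairs each pixel with
 * its right neighbour (0,1, 1,2, ...) so one vector holds the adjacent
 * sample pairs consumed by two filter taps at a time; in the second row,
 * indices 16 and up select bytes from a second source register. */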
 
   31 #define HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset,  \ 
   34     v4i32 out0_r, out1_r, out0_l, out1_l;                           \ 
   36     ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r);               \ 
   37     ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l);               \ 
   39     out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt);  \ 
   40     out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt);  \ 
   41     out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt);  \ 
   42     out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt);  \ 
   44     SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                \ 
   45     CLIP_SW4_0_255(out0_l, out0_r, out1_l, out1_r);                 \ 
   46     PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1);        \ 
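/* Scalar sketch of the macro above (illustrative only, not part of the
 * original file): per lane, `a` comes from the stored int16_t intermediates
 * (vec0/vec1) and `b` from the freshly computed prediction (in0/in1);
 * ILVR/ILVL place them side by side so one __msa_dpadd_s_w applies both
 * weights and the offset at once.  With wgt the packed weight word the
 * callers broadcast:
 *
 *     int w0  = (int16_t) (wgt & 0xffff);        // weight for a
 *     int w1  = (int16_t) (wgt >> 16);           // weight for b
 *     int sum = offset + a * w0 + b * w1;        // one dpadd_s_w
 *     int val = (sum + (1 << (rnd - 1))) >> rnd; // SRAR_W4_SW rounding
 *     out = av_clip_uint8(val);                  // CLIP_SW4_0_255 + pack
 */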
   49 #define HEVC_BIW_RND_CLIP4(in0, in1, in2, in3, vec0, vec1, vec2, vec3,       \ 
   50                            wgt, rnd, offset, out0, out1, out2, out3)         \ 
   52     HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, out0, out1);  \ 
   53     HEVC_BIW_RND_CLIP2(in2, in3, vec2, vec3, wgt, rnd, offset, out2, out3);  \ 
   56 #define HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd,  \ 
   59     v4i32 out0_r, out1_r, out0_l, out1_l;                            \ 
   61     ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r);                \ 
   62     ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l);                \ 
   63     out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt);   \ 
   64     out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt);   \ 
   65     out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt);   \ 
   66     out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt);   \ 
   67     SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                 \ 
   68     CLIP_SW4_0_255(out0_r, out1_r, out0_l, out1_l);                  \ 
   69     PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1);         \ 
   72 #define HEVC_BIW_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2,  \ 
   73                                     vec3, wgt, rnd, offset, out0, out1,    \ 
   76     HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd, offset,    \ 
   78     HEVC_BIW_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, wgt, rnd, offset,    \ 
   84                                    const int16_t *src1_ptr,
 
   95     uint32_t loop_cnt, tp0, tp1, tp2, tp3;
 
   96     uint64_t tpd0, tpd1, tpd2, tpd3;
 
  101     v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
 
  102     v8i16 dst0, dst1, dst2, dst3, weight_vec;
 
  103     v4i32 dst0_r, dst0_l, offset_vec, rnd_vec;
 
  105     offset = (offset0 + offset1) << rnd_val;
 
  106     weight0 = weight0 & 0x0000FFFF;
 
  107     weight = weight0 | (weight1 << 16);
 
   109     offset_vec = __msa_fill_w(offset);
 
   110     weight_vec = (v8i16) __msa_fill_w(weight);
 
  111     rnd_vec = __msa_fill_w(rnd_val + 1);
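    /* The two weights travel in one 32-bit word (weight0 in the low
     * halfword, weight1 in the high halfword) so a single dot-product-add
     * can apply both.  rnd_vec is rnd_val + 1 because bi-prediction halves
     * the sum of two sources, and offset is pre-scaled by << rnd_val: the
     * extra 1 << rnd_val added by SRAR's rounding completes the
     * ((offset0 + offset1 + 1) << rnd_val) bias of the HEVC weighted
     * bi-prediction formula. */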
 
  114         LW2(src0_ptr, src_stride, tp0, tp1);
 
  116         LD2(src1_ptr, src2_stride, tpd0, tpd1);
 
   119         dst0 = (v8i16) __msa_ilvr_b(zero, src0);
 
  123         dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, weight_vec);
 
  124         dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, weight_vec);
 
  127         dst0 = (v8i16) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
 
  128         out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
 
  131         LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
 
  133         LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
 
  139                                     offset_vec, dst0, dst1);
 
  140         out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
 
   141         ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
 
   142     } else if (0 == height % 8) {
 
   143         for (loop_cnt = (height >> 3); loop_cnt--;) {
 
  144             LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
 
  145             src0_ptr += 4 * src_stride;
 
  147             LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
 
  148             src0_ptr += 4 * src_stride;
 
  150             LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
 
  151             src1_ptr += (4 * src2_stride);
 
  154             LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
 
  155             src1_ptr += (4 * src2_stride);
 
  160             SLLI_4V(dst0, dst1, dst2, dst3, 6);
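            /* << 6 lifts the copied pixels to the 14-bit intermediate
             * precision already used by the int16_t values in src1_ptr. */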
 
  162                                         in3, weight_vec, rnd_vec, offset_vec,
 
  163                                         dst0, dst1, dst2, dst3);
 
   165             ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
 
  166             dst += (8 * dst_stride);
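            /* Each 4-pixel output row is one 32-bit word, so ST_W8
             * scatters the eight packed rows of out0/out1 with a single
             * word store per row. */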
 
  173                                    const int16_t *src1_ptr,
 
  187     uint64_t tp0, tp1, tp2, tp3;
 
  191     v8i16 in0, in1, in2, in3;
 
  192     v8i16 dst0, dst1, dst2, dst3;
 
  193     v4i32 offset_vec, weight_vec, rnd_vec;
 
  195     offset = (offset0 + offset1) << rnd_val;
 
  196     weight0 = weight0 & 0x0000FFFF;
 
  197     weight = weight0 | (weight1 << 16);
 
   199     weight_vec = __msa_fill_w(weight);
 
   200     offset_vec = __msa_fill_w(offset);
 
  201     rnd_vec = __msa_fill_w(rnd_val + 1);
 
   203     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
  204         LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
 
  205         src0_ptr += (4 * src_stride);
 
  208         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
 
  209         src1_ptr += (4 * src2_stride);
 
  212         SLLI_4V(dst0, dst1, dst2, dst3, 6);
 
  215                                     weight_vec, rnd_vec, offset_vec,
 
  216                                     dst0, dst1, dst2, dst3);
 
   219         ST_H2(out0, 2, 6, dst + 4, dst_stride);
 
   220         ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
 
   221         ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
 
  222         dst += (4 * dst_stride);
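        /* A 6-pixel row is written as a 4-byte word plus a 2-byte halfword
         * at column 4 (halfword lanes 2 and 6 of the packed vector), so
         * nothing is stored past the block's right edge. */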
 
  225         LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
 
  226         src0_ptr += (4 * src_stride);
 
  229         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
 
  230         src1_ptr += (4 * src2_stride);
 
  233         SLLI_4V(dst0, dst1, dst2, dst3, 6);
 
  236                                     weight_vec, rnd_vec, offset_vec,
 
  237                                     dst0, dst1, dst2, dst3);
 
   241         ST_H2(out0, 2, 6, dst + 4, dst_stride);
 
   242         ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
 
   243         ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
 
  249                                    const int16_t *src1_ptr,
 
  260     uint64_t tp0, tp1, tp2, tp3;
 
  262     v16u8 out0, out1, out2;
 
  265     v8i16 in0, in1, in2, in3, in4, in5;
 
  266     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
 
  267     v4i32 offset_vec, weight_vec, rnd_vec;
 
  269     offset = (offset0 + offset1) << rnd_val;
 
  270     weight0 = weight0 & 0x0000FFFF;
 
  271     weight = weight0 | (weight1 << 16);
 
   273     offset_vec = __msa_fill_w(offset);
 
   274     weight_vec = __msa_fill_w(weight);
 
  275     rnd_vec = __msa_fill_w(rnd_val + 1);
 
  278         LD2(src0_ptr, src_stride, tp0, tp1);
 
  280         LD_SH2(src1_ptr, src2_stride, in0, in1);
 
  285                            weight_vec, rnd_vec, offset_vec,
 
  288         out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
 
  291         LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
 
  292         src0_ptr += 4 * src_stride;
 
  295         LD2(src0_ptr, src_stride, tp0, tp1);
 
  300         LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
 
  301         SLLI_4V(dst0, dst1, dst2, dst3, 6);
 
  304                                     weight_vec, rnd_vec, offset_vec, dst0, dst1,
 
  307                                     offset_vec, dst4, dst5);
 
  308         PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
 
   309         ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
 
   310         ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
 
   311     } else if (0 == height % 4) {
 
   314         for (loop_cnt = (height >> 2); loop_cnt--;) {
 
  315             LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
 
  316             src0_ptr += (4 * src_stride);
 
  321             LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
 
  322             src1_ptr += (4 * src2_stride);
 
  324             SLLI_4V(dst0, dst1, dst2, dst3, 6);
 
  326                                         in3, weight_vec, rnd_vec, offset_vec,
 
  327                                         dst0, dst1, dst2, dst3);
 
   329             ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
 
  330             dst += (4 * dst_stride);
 
  337                                     const int16_t *src1_ptr,
 
  351     v16u8 out0, out1, out2;
 
  353     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
 
  354     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
 
  355     v4i32 offset_vec, weight_vec, rnd_vec;
 
  357     offset = (offset0 + offset1) << rnd_val;
 
  358     weight0 = weight0 & 0x0000FFFF;
 
  359     weight = weight0 | (weight1 << 16);
 
   361     offset_vec = __msa_fill_w(offset);
 
   362     weight_vec = __msa_fill_w(weight);
 
  363     rnd_vec = __msa_fill_w(rnd_val + 1);
 
   365     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
  367         src0_ptr += (4 * src_stride);
 
  368         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
 
  369         LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
 
  370         src1_ptr += (4 * src2_stride);
 
  374                    dst0, dst1, dst2, dst3);
 
  376         SLLI_4V(dst0, dst1, dst2, dst3, 6);
 
  383                                     weight_vec, rnd_vec, offset_vec, dst0, dst1,
 
  386                                     offset_vec, dst4, dst5);
 
  387         PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
 
   388         ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
 
   389         ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
 
  390         dst += (4 * dst_stride);
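        /* Width 12 splits into an 8-byte store per row (ST_D4 from
         * out0/out1) and a 4-byte remainder at column 8 (ST_W4 from
         * out2). */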
 
  396                                     const int16_t *src1_ptr,
 
  409     v16u8 out0, out1, out2, out3;
 
  412     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
 
  413     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 
  414     v4i32 offset_vec, weight_vec, rnd_vec;
 
  416     offset = (offset0 + offset1) << rnd_val;
 
  417     weight0 = weight0 & 0x0000FFFF;
 
  418     weight = weight0 | (weight1 << 16);
 
   420     offset_vec = __msa_fill_w(offset);
 
   421     weight_vec = __msa_fill_w(weight);
 
  422     rnd_vec = __msa_fill_w(rnd_val + 1);
 
   424     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
  426         src0_ptr += (4 * src_stride);
 
  427         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
 
  428         LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
 
  429         src1_ptr += (4 * src2_stride);
 
   430         ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1,
 
   432         ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5,
 
  434         SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
 
  435         SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
 
  437                                     weight_vec, rnd_vec, offset_vec, tmp0, tmp1,
 
  440                                     weight_vec, rnd_vec, offset_vec, tmp2, tmp3,
 
   444         ST_UB4(out0, out1, out2, out3, dst, dst_stride);
 
  445         dst += (4 * dst_stride);
 
  451                                     const int16_t *src1_ptr,
 
  464     v16u8 out0, out1, out2, out3, out4, out5;
 
  466     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
 
  467     v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
 
  468     v4i32 offset_vec, weight_vec, rnd_vec;
 
  470     offset = (offset0 + offset1) << rnd_val;
 
  471     weight0 = weight0 & 0x0000FFFF;
 
  472     weight = weight0 | (weight1 << 16);
 
   474     offset_vec = __msa_fill_w(offset);
 
   475     weight_vec = __msa_fill_w(weight);
 
  476     rnd_vec = __msa_fill_w(rnd_val + 1);
 
  478     for (loop_cnt = 8; loop_cnt--;) {
 
   480         LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7);
 
  481         src0_ptr += (4 * src_stride);
 
  482         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
 
  483         LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
 
  484         LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
 
  485         src1_ptr += (4 * src2_stride);
 
  493         SLLI_4V(dst0, dst1, dst2, dst3, 6);
 
  494         SLLI_4V(dst4, dst5, dst6, dst7, 6);
 
  495         SLLI_4V(dst8, dst9, dst10, dst11, 6);
 
  497                                     weight_vec, rnd_vec, offset_vec, dst0, dst1,
 
  500                                     weight_vec, rnd_vec, offset_vec, dst4, dst5,
 
  503                                     in11, weight_vec, rnd_vec, offset_vec,
 
  504                                     dst8, dst9, dst10, dst11);
 
  505         PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
 
  506         PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
 
   507         ST_UB4(out0, out1, out3, out4, dst, dst_stride);
 
   508         ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
 
  509         dst += (4 * dst_stride);
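        /* Width 24 is handled as a full 16-byte vector per row (ST_UB4)
         * plus an 8-byte remainder at column 16 (ST_D4 from the third
         * vector of each PCKEV_B3_UB group). */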
 
  515                                     const int16_t *src1_ptr,
 
  528     v16u8 out0, out1, out2, out3;
 
  531     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
 
  532     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 
  533     v4i32 offset_vec, weight_vec, rnd_vec;
 
  535     offset = (offset0 + offset1) << rnd_val;
 
  536     weight0 = weight0 & 0x0000FFFF;
 
  537     weight = weight0 | (weight1 << 16);
 
   539     offset_vec = __msa_fill_w(offset);
 
   540     weight_vec = __msa_fill_w(weight);
 
  541     rnd_vec = __msa_fill_w(rnd_val + 1);
 
   543     for (loop_cnt = (height >> 1); loop_cnt--;) {
 
  545         src0_ptr += src_stride;
 
  547         src0_ptr += src_stride;
 
  548         LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
 
  549         src1_ptr += src2_stride;
 
  550         LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
 
  551         src1_ptr += src2_stride;
 
  557         SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
 
  558         SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
 
  560                                     weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
 
  563                                     weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
 
  576                                     const int16_t *src1_ptr,
 
  589     v16u8 out0, out1, out2;
 
  592     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, in0, in1, in2, in3, in4, in5;
 
  593     v4i32 offset_vec, weight_vec, rnd_vec;
 
  595     offset = (offset0 + offset1) << rnd_val;
 
  596     weight0 = weight0 & 0x0000FFFF;
 
  597     weight = weight0 | (weight1 << 16);
 
   599     offset_vec = __msa_fill_w(offset);
 
   600     weight_vec = __msa_fill_w(weight);
 
  601     rnd_vec = __msa_fill_w(rnd_val + 1);
 
  603     for (loop_cnt = 64; loop_cnt--;) {
 
  605         src0_ptr += src_stride;
 
  606         LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
 
  607         src1_ptr += src2_stride;
 
  612         SLLI_4V(dst0, dst1, dst2, dst3, 6);
 
  615                                     weight_vec, rnd_vec, offset_vec, dst0, dst1,
 
  618                                     offset_vec, dst4, dst5);
 
  619         PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
 
  628                                     const int16_t *src1_ptr,
 
  641     v16u8 out0, out1, out2, out3;
 
  644     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
 
  645     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 
  646     v4i32 offset_vec, weight_vec, rnd_vec;
 
  648     offset = (offset0 + offset1) << rnd_val;
 
  649     weight0 = weight0 & 0x0000FFFF;
 
  650     weight = weight0 | (weight1 << 16);
 
   652     offset_vec = __msa_fill_w(offset);
 
   653     weight_vec = __msa_fill_w(weight);
 
  654     rnd_vec = __msa_fill_w(rnd_val + 1);
 
   656     for (loop_cnt = height; loop_cnt--;) {
 
  658         src0_ptr += src_stride;
 
  659         LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
 
  660         src1_ptr += src2_stride;
 
   662         ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1,
 
   664         ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5,
 
  666         SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
 
  667         SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
 
  669                                     weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
 
  672                                     weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
 
  683                                     const int16_t *src1_ptr,
 
  697     v8i16 filt0, filt1, filt2, filt3;
 
  699     v16i8 mask1, mask2, mask3;
 
  700     v16i8 vec0, vec1, vec2, vec3;
 
  702     v8i16 in0, in1, in2, in3;
 
  703     v8i16 filter_vec, out0, out1;
 
  704     v4i32 weight_vec, offset_vec, rnd_vec;
 
  709     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  715     offset = (offset0 + offset1) << rnd_val;
 
  716     weight0 = weight0 & 0x0000FFFF;
 
  717     weight = weight0 | (weight1 << 16);
 
  718     constant = 128 * weight1;
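    /* The source pixels are XORed with 128 before filtering so signed-byte
     * multiplies can be used; the 8 taps sum to 64, so the filtered path is
     * biased by -(128 << 6).  Scaling this constant by << 6 and adding it
     * to offset cancels the bias after the weight1 multiply that the
     * filtered samples receive. */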
 
   722     offset_vec = __msa_fill_w(offset);
 
   723     weight_vec = __msa_fill_w(weight);
 
  724     rnd_vec = __msa_fill_w(rnd_val + 1);
 
   726     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
  728         src0_ptr += (4 * src_stride);
 
  729         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
 
  730         src1_ptr += (4 * src2_stride);
 
  735                    vec0, vec1, vec2, vec3);
 
  739                    vec0, vec1, vec2, vec3);
 
  744                            weight_vec, rnd_vec, offset_vec,
 
  747         out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
 
   748         ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
 
  749         dst += (4 * dst_stride);
 
  755                                     const int16_t *src1_ptr,
 
  769     v8i16 filt0, filt1, filt2, filt3;
 
  771     v16i8 mask1, mask2, mask3;
 
  772     v16i8 vec0, vec1, vec2, vec3;
 
  773     v8i16 dst0, dst1, dst2, dst3;
 
  774     v8i16 in0, in1, in2, in3;
 
  775     v8i16 filter_vec, out0, out1, out2, out3;
 
  776     v4i32 weight_vec, offset_vec, rnd_vec;
 
  780     offset = (offset0 + offset1) << rnd_val;
 
  781     weight0 = weight0 & 0x0000FFFF;
 
  782     weight = weight0 | (weight1 << 16);
 
  783     constant = 128 * weight1;
 
   787     offset_vec = __msa_fill_w(offset);
 
   788     weight_vec = __msa_fill_w(weight);
 
  789     rnd_vec = __msa_fill_w(rnd_val + 1);
 
  792     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
   798     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
  800         src0_ptr += (4 * src_stride);
 
  801         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
 
  802         src1_ptr += (4 * src2_stride);
 
  806                    vec0, vec1, vec2, vec3);
 
  810                    vec0, vec1, vec2, vec3);
 
  814                    vec0, vec1, vec2, vec3);
 
  817         VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
 
  818                    vec0, vec1, vec2, vec3);
 
  824                            weight_vec, rnd_vec, offset_vec,
 
  825                            out0, out1, out2, out3);
 
   828         ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
 
  829         dst += (4 * dst_stride);
 
  835                                      const int16_t *src1_ptr,
 
  850     v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
 
  851     v8i16 filt0, filt1, filt2, filt3, out0, out1, out2, out3;
 
  852     v8i16 dst0, dst1, dst2, dst3, in0, in1, in2, in3, filter_vec;
 
  853     v4i32 weight_vec, offset_vec, rnd_vec;
 
  857     weight0 = weight0 & 0x0000FFFF;
 
  858     weight = weight0 | (weight1 << 16);
 
  859     constant = 128 * weight1;
 
  861     offset = (offset0 + offset1) << rnd_val;
 
   864     offset_vec = __msa_fill_w(offset);
 
   865     weight_vec = __msa_fill_w(weight);
 
  866     rnd_vec = __msa_fill_w(rnd_val + 1);
 
  869     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  880     for (loop_cnt = 4; loop_cnt--;) {
 
  882         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
 
  896         VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
 
  901                            weight_vec, rnd_vec, offset_vec, out0, out1, out2,
 
   904         ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
 
  907         src0_ptr += (4 * src_stride);
 
  908         LD_SH4(src1_ptr + 8, src2_stride, in0, in1, in2, in3);
 
  909         src1_ptr += (4 * src2_stride);
 
   916         VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
 
  921                            offset_vec, out0, out1);
 
  922         out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
 
   923         ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride);
 
  924         dst += (4 * dst_stride);
 
  930                                      const int16_t *src1_ptr,
 
  945     v8i16 in0, in1, in2, in3;
 
  946     v8i16 filt0, filt1, filt2, filt3;
 
  947     v16i8 mask1, mask2, mask3;
 
  948     v8i16 filter_vec, out0, out1, out2, out3;
 
  949     v16i8 vec0, vec1, vec2, vec3;
 
  950     v8i16 dst0, dst1, dst2, dst3;
 
  951     v4i32 weight_vec, offset_vec, rnd_vec;
 
  952     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
 
  955     offset = (offset0 + offset1) << rnd_val;
 
  956     weight0 = weight0 & 0x0000FFFF;
 
  957     weight = weight0 | (weight1 << 16);
 
  958     constant = 128 * weight1;
 
   962     offset_vec = __msa_fill_w(offset);
 
   963     weight_vec = __msa_fill_w(weight);
 
  964     rnd_vec = __msa_fill_w(rnd_val + 1);
 
  967     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
   973     for (loop_cnt = (height >> 1); loop_cnt--;) {
 
  975         src0_ptr += src_stride;
 
  977         src0_ptr += src_stride;
 
  978         LD_SH2(src1_ptr, 8, in0, in1);
 
  979         src1_ptr += src2_stride;
 
  980         LD_SH2(src1_ptr, 8, in2, in3);
 
  981         src1_ptr += src2_stride;
 
  985                    vec0, vec1, vec2, vec3);
 
  989                    vec0, vec1, vec2, vec3);
 
  993                    vec0, vec1, vec2, vec3);
 
  996         VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
 
  997                    vec0, vec1, vec2, vec3);
 
 1003                            weight_vec, rnd_vec, offset_vec,
 
 1004                            out0, out1, out2, out3);
 
 1008         dst += (2 * dst_stride);
 
 1014                                      const int16_t *src1_ptr,
 
 1030     v8i16 in0, in1, in2;
 
 1031     v8i16 filt0, filt1, filt2, filt3;
 
 1032     v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
 
 1033     v16i8 vec0, vec1, vec2, vec3;
 
 1034     v8i16 dst0, dst1, dst2;
 
 1035     v4i32 dst2_r, dst2_l;
 
 1036     v8i16 filter_vec, out0, out1, out2;
 
 1037     v4i32 weight_vec, offset_vec, rnd_vec;
 
 1040     src0_ptr = src0_ptr - 3;
 
 1041     offset = (offset0 + offset1) << rnd_val;
 
 1042     weight0 = weight0 & 0x0000FFFF;
 
 1043     weight = weight0 | (weight1 << 16);
 
 1044     constant = 128 * weight1;
 
  1048     offset_vec = __msa_fill_w(offset);
 
  1049     weight_vec = __msa_fill_w(weight);
 
 1050     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 1053     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 1064     src0_ptr += src_stride;
 
 1065     LD_SH2(src1_ptr, 8, in0, in1);
 
  1066     in2 = LD_SH(src1_ptr + 16);
 
 1067     src1_ptr += src2_stride;
 
 1070     for (loop_cnt = 31; loop_cnt--;) {
 
 1072                    vec0, vec1, vec2, vec3);
 
 1076                    vec0, vec1, vec2, vec3);
 
 1080                    vec0, vec1, vec2, vec3);
 
 1085                            weight_vec, rnd_vec, offset_vec,
 
 1089         dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
 
 1090                                  (v8i16) weight_vec);
 
 1091         dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
 
 1092                                  (v8i16) weight_vec);
 
 1095         out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
 
 1098         src0_ptr += src_stride;
 
 1099         LD_SH2(src1_ptr, 8, in0, in1);
 
  1100         in2 = LD_SH(src1_ptr + 16);
 
 1101         src1_ptr += src2_stride;
 
 1104         dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
 
  1106         SD(dst_val0, dst + 16);
 
 1122     dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r, (v8i16) weight_vec);
 
 1123     dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, (v8i16) weight_vec);
 
 1126     out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
 
 1128     dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
 
  1130     SD(dst_val0, dst + 16);
 
 1136                                      const int16_t *src1_ptr,
 
 1151     v8i16 in0, in1, in2, in3;
 
 1152     v8i16 filt0, filt1, filt2, filt3;
 
 1154     v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
 
 1155     v16i8 vec0, vec1, vec2, vec3;
 
 1156     v8i16 dst0, dst1, dst2, dst3;
 
 1157     v8i16 filter_vec, out0, out1, out2, out3;
 
 1158     v4i32 weight_vec, offset_vec, rnd_vec;
 
 1161     offset = (offset0 + offset1) << rnd_val;
 
 1162     weight0 = weight0 & 0x0000FFFF;
 
 1163     weight = weight0 | (weight1 << 16);
 
 1164     constant = 128 * weight1;
 
  1168     offset_vec = __msa_fill_w(offset);
 
  1169     weight_vec = __msa_fill_w(weight);
 
 1170     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 1173     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  1183     for (loop_cnt = height; loop_cnt--;) {
 
 1186         src0_ptr += src_stride;
 
 1187         LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
 
 1188         src1_ptr += src2_stride;
 
 1193                    vec0, vec1, vec2, vec3);
 
 1197                    vec0, vec1, vec2, vec3);
 
 1201                    vec0, vec1, vec2, vec3);
 
 1205                    vec0, vec1, vec2, vec3);
 
 1211                            weight_vec, rnd_vec, offset_vec,
 
 1212                            out0, out1, out2, out3);
 
 1222                                      const int16_t *src1_ptr,
 
 1237     v8i16 in0, in1, in2, in3;
 
 1238     v8i16 filt0, filt1, filt2, filt3;
 
 1240     v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
 
 1241     v16i8 vec0, vec1, vec2, vec3;
 
 1242     v8i16 dst0, dst1, dst2, dst3;
 
 1243     v8i16 filter_vec, out0, out1, out2, out3;
 
 1244     v4i32 weight_vec, offset_vec, rnd_vec;
 
 1247     offset = (offset0 + offset1) << rnd_val;
 
 1248     weight0 = weight0 & 0x0000FFFF;
 
 1249     weight = weight0 | (weight1 << 16);
 
 1250     constant = 128 * weight1;
 
  1254     offset_vec = __msa_fill_w(offset);
 
  1255     weight_vec = __msa_fill_w(weight);
 
 1256     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 1259     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 1269     for (loop_cnt = 64; loop_cnt--;) {
 
 1272         LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
 
 1274         LD_SB2(src0_ptr + 32, 8, src3, src4);
 
 1275         src0_ptr += src_stride;
 
 1279                    vec0, vec1, vec2, vec3);
 
 1283                    vec0, vec1, vec2, vec3);
 
 1287                    vec0, vec1, vec2, vec3);
 
 1291                    vec0, vec1, vec2, vec3);
 
 1296                            weight_vec, rnd_vec, offset_vec,
 
 1297                            out0, out1, out2, out3);
 
 1302         LD_SH2(src1_ptr + 32, 8, in2, in3);
 
 1303         src1_ptr += src2_stride;
 
 1305         VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
 
 1306                    vec0, vec1, vec2, vec3);
 
 1309         VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
 
 1310                    vec0, vec1, vec2, vec3);
 
 1315                            weight_vec, rnd_vec, offset_vec,
 
 1318         out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
 
 1326                                      const int16_t *src1_ptr,
 
 1338     const uint8_t *src0_ptr_tmp;
 
 1340     const int16_t *src1_ptr_tmp;
 
 1341     uint32_t loop_cnt, cnt;
 
 1344     v8i16 in0, in1, in2, in3;
 
 1345     v8i16 filt0, filt1, filt2, filt3;
 
 1347     v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
 
 1348     v16i8 vec0, vec1, vec2, vec3;
 
 1349     v8i16 dst0, dst1, dst2, dst3;
 
 1350     v8i16 filter_vec, out0, out1, out2, out3;
 
 1351     v4i32 weight_vec, offset_vec, rnd_vec;
 
 1354     offset = (offset0 + offset1) << rnd_val;
 
 1355     weight0 = weight0 & 0x0000FFFF;
 
 1356     weight = weight0 | (weight1 << 16);
 
 1357     constant = 128 * weight1;
 
  1361     offset_vec = __msa_fill_w(offset);
 
  1362     weight_vec = __msa_fill_w(weight);
 
 1363     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 1366     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  1376     for (loop_cnt = height; loop_cnt--;) {
 
 1377         src0_ptr_tmp = src0_ptr;
 
 1379         src1_ptr_tmp = src1_ptr;
 
 1381         for (cnt = 2; cnt--;) {
 
 1385             LD_SH4(src1_ptr_tmp, 8, in0, in1, in2, in3);
 
 1390                        vec0, vec1, vec2, vec3);
 
 1394                        vec0, vec1, vec2, vec3);
 
 1398                        vec0, vec1, vec2, vec3);
 
 1402                        vec0, vec1, vec2, vec3);
 
 1408                                weight_vec, rnd_vec, offset_vec,
 
 1409                                out0, out1, out2, out3);
 
 1412             ST_SH2(out0, out1, dst_tmp, 16);
 
 1416         src0_ptr += src_stride;
 
 1417         src1_ptr += src2_stride;
 
 1425                                     const int16_t *src1_ptr,
 
  1440     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
 1441     v16i8 src11, src12, src13, src14;
 
 1442     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
 
 1443     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
 
 1444     v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
 
 1445     v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
 
 1446     v16i8 src2110, src4332, src6554, src8776, src10998;
 
 1447     v16i8 src12111110, src14131312;
 
 1448     v8i16 dst10, dst32, dst54, dst76;
 
 1449     v8i16 filt0, filt1, filt2, filt3;
 
 1450     v8i16 filter_vec, out0, out1, out2, out3;
 
 1451     v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
 
 1453     src0_ptr -= (3 * src_stride);
 
 1454     offset = (offset0 + offset1) << rnd_val;
 
 1455     weight0 = weight0 & 0x0000FFFF;
 
 1456     weight = weight0 | (weight1 << 16);
 
 1458     const_vec = __msa_ldi_w(128);
 
  1460     offset_vec = __msa_fill_w(offset);
 
  1461     weight_vec = __msa_fill_w(weight);
 
 1462     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 1463     weight1_vec = __msa_fill_w(weight1);
 
 1464     offset_vec += const_vec * weight1_vec;
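    /* Same 128-bias compensation as in the horizontal filters, done here in
     * vector registers: const_vec carries 128 scaled to the intermediate
     * domain, and the multiply by weight1_vec folds the correction for the
     * signed-byte source trick straight into offset_vec. */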
 
 1467     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 1470     src0_ptr += (7 * src_stride);
 
 1473                src10_r, src32_r, src54_r, src21_r);
 
 1474     ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
 
 1475     ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
 
 1476                src2110, src4332, src6554);
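    /* For the 4-wide vertical filter, the right-interleaved byte pairs of
     * two consecutive rows are merged into one register (src2110 holds the
     * row1|row0 and row2|row1 interleaves side by side), so every dot
     * product below produces two output rows at once. */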
 
  1479     for (loop_cnt = (height >> 3); loop_cnt--;) {
 
 1480         LD_SB8(src0_ptr, src_stride,
 
 1481                src7, src8, src9, src10, src11, src12, src13, src14);
 
 1482         src0_ptr += (8 * src_stride);
 
 1483         LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
 
 1484         src1_ptr += (8 * src2_stride);
 
 1488         ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
 
 1489                    src76_r, src87_r, src98_r, src109_r);
 
 1490         ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
 
 1491                    src1110_r, src1211_r, src1312_r, src1413_r);
 
 1492         ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
 
 1493                    src1413_r, src1312_r,
 
 1494                    src8776, src10998, src12111110, src14131312);
 
 1497         DOTP_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt0, filt0,
 
 1498                     filt0, dst10, dst32, dst54, dst76);
 
 1499         DPADD_SB4_SH(src4332, src6554, src8776, src10998, filt1, filt1, filt1,
 
 1500                      filt1, dst10, dst32, dst54, dst76);
 
 1501         DPADD_SB4_SH(src6554, src8776, src10998, src12111110, filt2, filt2,
 
 1502                      filt2, filt2, dst10, dst32, dst54, dst76);
 
 1503         DPADD_SB4_SH(src8776, src10998, src12111110, src14131312, filt3, filt3,
 
 1504                      filt3, filt3, dst10, dst32, dst54, dst76);
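        /* The 8-tap vertical filter runs as one DOTP_SB4_SH for taps 0-1
         * followed by three DPADD_SB4_SH accumulations for taps 2-3, 4-5
         * and 6-7; each filt register holds one signed-byte coefficient
         * pair, matching the byte-interleaved row pairs. */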
 
 1508                            weight_vec, rnd_vec, offset_vec,
 
 1509                            out0, out1, out2, out3);
 
  1512         ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
 
 1513         dst += (8 * dst_stride);
 
 1516         src4332 = src12111110;
 
 1517         src6554 = src14131312;
 
  1521         LD_SB8(src0_ptr, src_stride,
 
 1522                src7, src8, src9, src10, src11, src12, src13, src14);
 
 1523         src0_ptr += (8 * src_stride);
 
 1524         LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
 
 1525         src1_ptr += (8 * src2_stride);
 
 1529         ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
 
 1530                    src76_r, src87_r, src98_r, src109_r);
 
 1531         ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
 
 1532                    src1110_r, src1211_r, src1312_r, src1413_r);
 
 1533         ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
 
 1534                    src1413_r, src1312_r,
 
 1535                    src8776, src10998, src12111110, src14131312);
 
 1538         DOTP_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt0, filt0,
 
 1539                     filt0, dst10, dst32, dst54, dst76);
 
 1540         DPADD_SB4_SH(src4332, src6554, src8776, src10998, filt1, filt1, filt1,
 
 1541                      filt1, dst10, dst32, dst54, dst76);
 
 1542         DPADD_SB4_SH(src6554, src8776, src10998, src12111110, filt2, filt2,
 
 1543                      filt2, filt2, dst10, dst32, dst54, dst76);
 
 1544         DPADD_SB4_SH(src8776, src10998, src12111110, src14131312, filt3, filt3,
 
 1545                      filt3, filt3, dst10, dst32, dst54, dst76);
 
 1549                            weight_vec, rnd_vec, offset_vec,
 
 1550                            out0, out1, out2, out3);
 
  1553         ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
 
 1556         src4332 = src12111110;
 
 1557         src6554 = src14131312;
 
 1564                                     const int16_t *src1_ptr,
 
 1579     v16i8 src6, src7, src8, src9, src10;
 
 1580     v8i16 in0, in1, in2, in3;
 
 1581     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
 
 1582     v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
 
 1583     v8i16 tmp0, tmp1, tmp2, tmp3;
 
 1584     v8i16 filt0, filt1, filt2, filt3;
 
 1585     v8i16 filter_vec, out0, out1, out2, out3;
 
 1586     v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
 
 1588     src0_ptr -= (3 * src_stride);
 
 1589     offset = (offset0 + offset1) << rnd_val;
 
 1590     weight0 = weight0 & 0x0000FFFF;
 
 1591     weight = weight0 | (weight1 << 16);
 
 1593     const_vec = __msa_ldi_w(128);
 
  1595     offset_vec = __msa_fill_w(offset);
 
  1596     weight_vec = __msa_fill_w(weight);
 
 1597     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 1598     weight1_vec = __msa_fill_w(weight1);
 
 1599     offset_vec += const_vec * weight1_vec;
 
 1602     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 1605     src0_ptr += (7 * src_stride);
 
 1609                src10_r, src32_r, src54_r, src21_r);
 
 1610     ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
 
  1612     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 1613         LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
 
 1614         src0_ptr += (4 * src_stride);
 
 1615         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
 
 1616         src1_ptr += (4 * src2_stride);
 
 1619         ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
 
 1620                    src76_r, src87_r, src98_r, src109_r);
 
 1622         DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
 
 1623                     filt0, tmp0, tmp1, tmp2, tmp3);
 
 1624         DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
 
 1625                      filt1, tmp0, tmp1, tmp2, tmp3);
 
 1626         DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
 
 1627                      filt2, tmp0, tmp1, tmp2, tmp3);
 
 1628         DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
 
 1629                      filt3, tmp0, tmp1, tmp2, tmp3);
 
 1633                            weight_vec, rnd_vec, offset_vec,
 
 1634                            out0, out1, out2, out3);
 
  1637         ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
 
 1638         dst += (4 * dst_stride);
 
 1652                                      const int16_t *src1_ptr,
 
  1666     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
 
 1667     v8i16 in0, in1, in2, in3;
 
 1668     v16i8 src10_r, src32_r, src54_r, src76_r;
 
 1669     v16i8 src21_r, src43_r, src65_r, src87_r;
 
 1670     v8i16 tmp0, tmp1, tmp2;
 
 1671     v16i8 src10_l, src32_l, src54_l, src76_l;
 
 1672     v16i8 src21_l, src43_l, src65_l, src87_l;
 
 1673     v16i8 src2110, src4332, src6554, src8776;
 
 1674     v8i16 filt0, filt1, filt2, filt3;
 
 1675     v8i16 out0, out1, out2, filter_vec;
 
 1676     v4i32 dst2_r, dst2_l;
 
 1677     v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
 
 1679     src0_ptr -= (3 * src_stride);
 
 1680     offset = (offset0 + offset1) << rnd_val;
 
 1681     weight0 = weight0 & 0x0000FFFF;
 
 1682     weight = weight0 | (weight1 << 16);
 
 1684     const_vec = __msa_ldi_w(128);
 
  1686     offset_vec = __msa_fill_w(offset);
 
  1687     weight_vec = __msa_fill_w(weight);
 
 1688     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 1689     weight1_vec = __msa_fill_w(weight1);
 
 1690     offset_vec += const_vec * weight1_vec;
 
 1693     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 1696     src0_ptr += (7 * src_stride);
 
 1700                src10_r, src32_r, src54_r, src21_r);
 
 1701     ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
 
 1703                src10_l, src32_l, src54_l, src21_l);
 
 1704     ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
 
 1705     ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
 
 1706                src2110, src4332, src6554);
 
 1708     for (loop_cnt = 8; loop_cnt--;) {
 
 1709         LD_SB2(src0_ptr, src_stride, src7, src8);
 
 1710         src0_ptr += (2 * src_stride);
 
 1711         LD_SH2(src1_ptr, src2_stride, in0, in1);
 
 1712         LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
 
 1713         src1_ptr += (2 * src2_stride);
 
 1714         in2 = (v8i16) __msa_ilvr_d((v2i64) in3, (v2i64) in2);
 
 1717         ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
 
 1718         ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
 
 1719         src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);
 
 1721         DOTP_SB3_SH(src10_r, src21_r, src2110, filt0, filt0, filt0,
 
 1723         DPADD_SB2_SH(src32_r, src43_r, filt1, filt1, tmp0, tmp1);
 
 1724         tmp2 = __msa_dpadd_s_h(tmp2, src4332, (v16i8) filt1);
 
 1725         DPADD_SB2_SH(src54_r, src65_r, filt2, filt2, tmp0, tmp1);
 
 1726         tmp2 = __msa_dpadd_s_h(tmp2, src6554, (v16i8) filt2);
 
 1727         DPADD_SB2_SH(src76_r, src87_r, filt3, filt3, tmp0, tmp1);
 
 1728         tmp2 = __msa_dpadd_s_h(tmp2, src8776, (v16i8) filt3);
 
 1731                            weight_vec, rnd_vec, offset_vec,
 
 1735         dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
 
 1736                                  (v8i16) weight_vec);
 
 1737         dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
 
 1738                                  (v8i16) weight_vec);
 
 1741         out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
 
  1743         ST_D2(out0, 0, 1, dst, dst_stride);
 
  1744         ST_W2(out2, 0, 1, dst + 8, dst_stride);
 
 1745         dst += (2 * dst_stride);
 
 1762                                               const int16_t *src1_ptr,
 
 1775     const uint8_t *src0_ptr_tmp;
 
 1776     const int16_t *src1_ptr_tmp;
 
 1778     uint32_t loop_cnt, cnt;
 
  1780     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
 
 1781     v8i16 in0, in1, in2, in3;
 
 1782     v16i8 src10_r, src32_r, src54_r, src76_r;
 
 1783     v16i8 src21_r, src43_r, src65_r, src87_r;
 
 1784     v16i8 src10_l, src32_l, src54_l, src76_l;
 
 1785     v16i8 src21_l, src43_l, src65_l, src87_l;
 
 1786     v8i16 tmp0, tmp1, tmp2, tmp3;
 
 1787     v8i16 filt0, filt1, filt2, filt3;
 
 1789     v8i16 out0, out1, out2, out3;
 
 1790     v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
 
 1792     src0_ptr -= (3 * src_stride);
 
 1794     offset = (offset0 + offset1) << rnd_val;
 
 1795     weight0 = weight0 & 0x0000FFFF;
 
 1796     weight = weight0 | (weight1 << 16);
 
 1798     const_vec = __msa_ldi_w(128);
 
  1800     offset_vec = __msa_fill_w(offset);
 
  1801     weight_vec = __msa_fill_w(weight);
 
 1802     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 1803     weight1_vec = __msa_fill_w(weight1);
 
 1804     offset_vec += const_vec * weight1_vec;
 
 1807     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  1809     for (cnt = (width >> 4); cnt--;) {
 
 1810         src0_ptr_tmp = src0_ptr;
 
 1811         src1_ptr_tmp = src1_ptr;
 
 1814         LD_SB7(src0_ptr_tmp, src_stride,
 
 1816         src0_ptr_tmp += (7 * src_stride);
 
 1820                    src10_r, src32_r, src54_r, src21_r);
 
 1821         ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
 
 1823                    src10_l, src32_l, src54_l, src21_l);
 
 1824         ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
 
  1826         for (loop_cnt = (height >> 1); loop_cnt--;) {
 
 1827             LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
 
 1828             src0_ptr_tmp += (2 * src_stride);
 
 1829             LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
 
 1830             LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
 
 1831             src1_ptr_tmp += (2 * src2_stride);
 
 1834             ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
 
 1835             ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
 
 1837             DOTP_SB4_SH(src10_r, src21_r, src10_l, src21_l, filt0, filt0,
 
 1838                         filt0, filt0, tmp0, tmp1, tmp2, tmp3);
 
 1839             DPADD_SB4_SH(src32_r, src43_r, src32_l, src43_l, filt1, filt1,
 
 1840                          filt1, filt1, tmp0, tmp1, tmp2, tmp3);
 
 1841             DPADD_SB4_SH(src54_r, src65_r, src54_l, src65_l, filt2, filt2,
 
 1842                          filt2, filt2, tmp0, tmp1, tmp2, tmp3);
 
 1843             DPADD_SB4_SH(src76_r, src87_r, src76_l, src87_l, filt3, filt3,
 
 1844                          filt3, filt3, tmp0, tmp1, tmp2, tmp3);
 
 1848                                weight_vec, rnd_vec, offset_vec,
 
 1849                                out0, out1, out2, out3);
 
 1852             ST_SH2(out0, out1, dst_tmp, dst_stride);
 
 1853             dst_tmp += (2 * dst_stride);
 
 1878                                      const int16_t *src1_ptr,
 
 1891                                       src1_ptr, src2_stride,
 
 1893                                       weight0, weight1, offset0, offset1,
 
 1899                                      const int16_t *src1_ptr,
 
 1912                                       src1_ptr, src2_stride,
 
 1914                                       weight0, weight1, offset0, offset1,
 
 1917                             src1_ptr + 16, src2_stride,
 
 1919                             weight0, weight1, offset0, offset1, rnd_val);
 
 1924                                      const int16_t *src1_ptr,
 
 1937                                       src1_ptr, src2_stride,
 
 1939                                       weight0, weight1, offset0, offset1,
 
 1945                                      const int16_t *src1_ptr,
 
 1958                                       src1_ptr, src2_stride,
 
 1960                                       weight0, weight1, offset0, offset1,
 
 1966                                      const int16_t *src1_ptr,
 
 1979                                       src1_ptr, src2_stride,
 
 1981                                       weight0, weight1, offset0, offset1,
 
 1987                                     const int16_t *src1_ptr,
 
 1991                                     const int8_t *filter_x,
 
 1992                                     const int8_t *filter_y,
 
  2004     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
 2005     v8i16 in0 = { 0 }, in1 = { 0 };
 
 2006     v8i16 filt0, filt1, filt2, filt3;
 
 2007     v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
 
 2008     v16i8 mask1, mask2, mask3;
 
 2009     v8i16 filter_vec, weight_vec;
 
 2010     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 2011     v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
 
 2012     v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
 
 2013     v8i16 tmp0, tmp1, tmp2, tmp3;
 
 2014     v8i16 dst10, dst32, dst54, dst76;
 
 2015     v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98;
 
 2016     v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;
 
 2019     src0_ptr -= ((3 * src_stride) + 3);
 
  2021     filter_vec = LD_SH(filter_x);
 
 2022     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  2024     filter_vec = LD_SH(filter_y);
 
 2027     SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
 
 2033     offset = (offset0 + offset1) << rnd_val;
 
 2034     weight0 = weight0 & 0x0000FFFF;
 
 2035     weight = weight0 | (weight1 << 16);
 
 2037     const_vec = __msa_fill_w((128 * weight1));
 
  2039     offset_vec = __msa_fill_w(offset);
 
 2040     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 2041     offset_vec += const_vec;
 
  2042     weight_vec = (v8i16) __msa_fill_w(weight);
 
 2045     src0_ptr += (7 * src_stride);
 
  2049     VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
 
  2050     VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
 
 2052                vec8, vec9, vec10, vec11);
 
 2053     VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
 
 2054                vec12, vec13, vec14, vec15);
 
 2069     dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
 
  2071     for (loop_cnt = height >> 2; loop_cnt--;) {
 
 2072         LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
 
 2073         src0_ptr += (4 * src_stride);
 
 2076         LD2(src1_ptr, src2_stride, tp0, tp1);
 
 2078         src1_ptr += (2 * src2_stride);
 
 2079         LD2(src1_ptr, src2_stride, tp0, tp1);
 
 2081         src1_ptr += (2 * src2_stride);
 
 2083         VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
 
 2084                    vec0, vec1, vec2, vec3);
 
 2085         VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
 
 2086                    vec4, vec5, vec6, vec7);
 
 2092         dst76 = __msa_ilvr_h(dst97, dst66);
 
 2094         dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
 
 2095         dst98 = __msa_ilvr_h(dst66, dst108);
 
  2097         dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
 
  2099         dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
 
  2101         dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
 
  2103         dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
 
 2105         SRA_4V(dst0, dst1, dst2, dst3, 6);
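        /* The two-stage filtering leaves the 32-bit sums with a spare
         * factor of 64 from the horizontal pass; the arithmetic shift by 6
         * removes it before the weighted combine (dpadd with the offset,
         * SRAR rounding, clip and pack) that follows. */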
 
 2109         dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
 
 2110         dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
 
 2111         dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
 
 2112         dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
 
 2116         out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
 
 2118         dst += (4 * dst_stride);
 
 2126         dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
 
 2132                                              const int16_t *src1_ptr,
 
 2136                                              const int8_t *filter_x,
 
 2137                                              const int8_t *filter_y,
 
 2146     uint32_t loop_cnt, cnt;
 
 2148     const uint8_t *src0_ptr_tmp;
 
 2149     const int16_t *src1_ptr_tmp;
 
  2152     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
 
 2154     v8i16 filt0, filt1, filt2, filt3;
 
 2155     v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
 
 2157     v16i8 mask1, mask2, mask3;
 
 2158     v8i16 filter_vec, weight_vec;
 
 2159     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 2160     v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
 
 2161     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
 
 2162     v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
 
 2163     v8i16 tmp0, tmp1, tmp2, tmp3;
 
 2164     v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
 
 2165     v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
 
 2166     v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
 
 2167     v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
 
 2168     v4i32 offset_vec, rnd_vec, const_vec;
 
 2170     src0_ptr -= ((3 * src_stride) + 3);
 
 2172     offset = (offset0 + offset1) << rnd_val;
 
 2173     weight0 = weight0 & 0x0000FFFF;
 
 2174     weight = weight0 | (weight1 << 16);
 
 2176     const_vec = __msa_fill_w((128 * weight1));
 
  2178     offset_vec = __msa_fill_w(offset);
 
 2179     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 2180     offset_vec += const_vec;
 
  2181     weight_vec = (v8i16) __msa_fill_w(weight);
 
  2183     filter_vec = LD_SH(filter_x);
 
 2184     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  2186     filter_vec = LD_SH(filter_y);
 
 2189     SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
 
 2195     for (cnt = width8mult; cnt--;) {
 
 2196         src0_ptr_tmp = src0_ptr;
 
 2197         src1_ptr_tmp = src1_ptr;
 
 2200         LD_SB7(src0_ptr_tmp, src_stride,
 
 2202         src0_ptr_tmp += (7 * src_stride);
 
 2208                    vec0, vec1, vec2, vec3);
 
 2210                    vec4, vec5, vec6, vec7);
 
 2212                    vec8, vec9, vec10, vec11);
 
 2213         VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
 
 2214                    vec12, vec13, vec14, vec15);
 
 2226         VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
 
 2227                    vec0, vec1, vec2, vec3);
 
 2228         VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
 
 2229                    vec4, vec5, vec6, vec7);
 
 2230         VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
 
 2231                    vec8, vec9, vec10, vec11);
 
  2240         for (loop_cnt = height >> 1; loop_cnt--;) {
 
 2241             LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
 
 2243             src0_ptr_tmp += 2 * src_stride;
 
 2245             LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
 
 2246             src1_ptr_tmp += (2 * src2_stride);
 
 2248             ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
 
 2249                        dst32_r, dst54_r, dst21_r);
 
 2250             ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
 
 2251                        dst32_l, dst54_l, dst21_l);
 
 2252             ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
 
 2253             ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
 
 2255             VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
 
 2256                        vec0, vec1, vec2, vec3);
 
 2262                                     filt_h0, filt_h1, filt_h2, filt_h3);
 
 2264                                     filt_h0, filt_h1, filt_h2, filt_h3);
 
 2270             VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
 
 2271                        vec0, vec1, vec2, vec3);
 
 2277                                     filt_h0, filt_h1, filt_h2, filt_h3);
 
 2279                                     filt_h0, filt_h1, filt_h2, filt_h3);
 
 2284             PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);
 
 2287             dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
 
 2288             dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
 
 2289             dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
 
 2290             dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
 
 2291             SRAR_W4_SW(dst0_l, dst0_r, dst1_l, dst1_r, rnd_vec);
 
 2293             PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
 
 2294             out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
 
  2295             ST_D2(out, 0, 1, dst_tmp, dst_stride);
 
 2296             dst_tmp += (2 * dst_stride);
 
 2315                                     const int16_t *src1_ptr,
 
 2319                                     const int8_t *filter_x,
 
 2320                                     const int8_t *filter_y,
 
 2329                                      src1_ptr, src2_stride,
 
 2330                                      dst, dst_stride, filter_x, filter_y,
 
 2331                                      height, weight0, weight1, offset0,
 
 2332                                      offset1, rnd_val, 1);
 
 2337                                      const int16_t *src1_ptr,
 
 2341                                      const int8_t *filter_x,
 
 2342                                      const int8_t *filter_y,
 
 2351     const uint8_t *src0_ptr_tmp;
 
 2353     const int16_t *src1_ptr_tmp;
 
  2357     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
 2358     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 2359     v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
 
 2360     v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
 
 2361     v8i16 in0 = { 0 }, in1 = { 0 };
 
 2362     v8i16 filter_vec, weight_vec, tmp0, tmp1, tmp2, tmp3;
 
 2363     v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
 
 2364     v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
 
 2365     v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
 
 2366     v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
 
 2367     v8i16 dst30, dst41, dst52, dst63, dst66, dst87, dst10, dst32, dst54, dst76;
 
 2368     v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98, dst87_r, dst87_l;
 
 2369     v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;
 
 2371     src0_ptr -= ((3 * src_stride) + 3);
 
 2373     offset = (offset0 + offset1) << rnd_val;
 
 2374     weight0 = weight0 & 0x0000FFFF;
 
 2375     weight = weight0 | (weight1 << 16);
 
 2377     const_vec = __msa_fill_w((128 * weight1));
 
  2379     offset_vec = __msa_fill_w(offset);
 
 2380     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 2381     offset_vec += const_vec;
 
  2382     weight_vec = (v8i16) __msa_fill_w(weight);
 
  2384     filter_vec = LD_SH(filter_x);
 
 2385     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  2387     filter_vec = LD_SH(filter_y);
 
 2390     SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
 
 2397     src0_ptr_tmp = src0_ptr;
 
 2398     src1_ptr_tmp = src1_ptr;
 
 2402     src0_ptr_tmp += (7 * src_stride);
 
 2409     VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
 
 2419     VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
 
 2420     VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
 
 2421     VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
 
 2430     for (loop_cnt = 8; loop_cnt--;) {
 
 2431         LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
 
 2432         src0_ptr_tmp += (2 * src_stride);
 
 2435         LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
 
 2436         src1_ptr_tmp += (2 * src2_stride);
 
 2438         ILVR_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
 
 2439                    dst10_r, dst32_r, dst54_r, dst21_r);
 
 2440         ILVL_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
 
 2441                    dst10_l, dst32_l, dst54_l, dst21_l);
 
 2442         ILVR_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_r, dst65_r);
 
 2443         ILVL_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_l, dst65_l);
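        /* The vertical 8-tap runs on interleaved row pairs: each dstNM_r/_l
         * vector zips rows N and M, and each filt_hK splat holds two
         * adjacent taps, so HEVC_FILT_8TAP applies two taps per
         * dot-product step. */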
 
 2445         VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
 
 2451         dst0 = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,

 2452                               filt_h1, filt_h2, filt_h3);

 2453         dst1 = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
 
 2454                               filt_h1, filt_h2, filt_h3);
 
 2458         VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
 
 2464         dst2 = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,

 2465                               filt_h1, filt_h2, filt_h3);

 2466         dst3 = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l, filt_h0,
 
 2467                               filt_h1, filt_h2, filt_h3);
 
 2474         dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
 
 2475         dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
 
 2476         dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
 
 2477         dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
 
 2481         out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
 
 2482         ST_D2(out, 0, 1, dst_tmp, dst_stride);
 
 2483         dst_tmp += (2 * dst_stride);
 
 2504     src0_ptr += (7 * src_stride);
 
 2507     VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);

 2508     VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);

 2509     VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
 
 2511     VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
 
 2525     dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
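    /* dst66 duplicates row 6 into both halves so the 8-tap window can be
     * re-zipped with incoming rows each iteration instead of re-filtering
     * old rows. */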
 
 2527     for (loop_cnt = 4; loop_cnt--;) {
 
 2528         LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
 
 2529         src0_ptr += (4 * src_stride);
 
 2532         LD2(src1_ptr, src2_stride, tp0, tp1);
 
 2534         src1_ptr += (2 * src2_stride);
 
 2535         LD2(src1_ptr, src2_stride, tp0, tp1);
 
 2537         src1_ptr += (2 * src2_stride);
 
 2539         VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
 
 2541         VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
 
 2548         dst76 = __msa_ilvr_h(dst97, dst66);
 
 2550         dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
 
 2551         dst98 = __msa_ilvr_h(dst66, dst108);
 
 2553         dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,

 2555         dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,

 2557         dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,

 2559         dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
 
 2561         SRA_4V(dst0, dst1, dst2, dst3, 6);
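        /* >> 6 drops the vertical-pass fraction, returning the 2-D filtered
         * samples to the same intermediate scale as the in0..in3 samples
         * read from src1_ptr. */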
 
 2565         dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
 
 2566         dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
 
 2567         dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
 
 2568         dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
 
 2572         out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
 
 2574         dst += (4 * dst_stride);
 
 2582         dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
 
 2588                                      const int16_t *src1_ptr,
 
 2592                                      const int8_t *filter_x,
 
 2593                                      const int8_t *filter_y,
 
 2602                                      src1_ptr, src2_stride,
 
 2603                                      dst, dst_stride, filter_x, filter_y,
 
 2604                                      height, weight0, weight1, offset0,
 
 2605                                      offset1, rnd_val, 2);
 
 2610                                      const int16_t *src1_ptr,
 
 2614                                      const int8_t *filter_x,
 
 2615                                      const int8_t *filter_y,
 
 2624                                      src1_ptr, src2_stride,
 
 2625                                      dst, dst_stride, filter_x, filter_y,
 
 2626                                      height, weight0, weight1, offset0,
 
 2627                                      offset1, rnd_val, 3);
 
 2632                                      const int16_t *src1_ptr,
 
 2636                                      const int8_t *filter_x,
 
 2637                                      const int8_t *filter_y,
 
 2646                                      src1_ptr, src2_stride,
 
 2647                                      dst, dst_stride, filter_x, filter_y,
 
 2648                                      height, weight0, weight1, offset0,
 
 2649                                      offset1, rnd_val, 4);
 
 2654                                      const int16_t *src1_ptr,
 
 2658                                      const int8_t *filter_x,
 
 2659                                      const int8_t *filter_y,
 
 2668                                      src1_ptr, src2_stride,
 
 2669                                      dst, dst_stride, filter_x, filter_y,
 
 2670                                      height, weight0, weight1, offset0,
 
 2671                                      offset1, rnd_val, 6);
 
 2676                                      const int16_t *src1_ptr,
 
 2680                                      const int8_t *filter_x,
 
 2681                                      const int8_t *filter_y,
 
 2690                                      src1_ptr, src2_stride,
 
 2691                                      dst, dst_stride, filter_x, filter_y,
 
 2692                                      height, weight0, weight1, offset0,
 
 2693                                      offset1, rnd_val, 8);
 
 2698                                      const int16_t *src1_ptr,
 
 2714     v16i8 mask1, vec0, vec1;
 
 2716     v4i32 dst0_r, dst0_l;
 
 2717     v8i16 out0, filter_vec;
 
 2718     v4i32 weight_vec, offset_vec, rnd_vec;
 
 2727     offset = (offset0 + offset1) << rnd_val;
 
 2728     weight0 = weight0 & 0x0000FFFF;
 
 2729     weight = weight0 | (weight1 << 16);
 
 2730     constant = 128 * weight1;
 
 2734     offset_vec = __msa_fill_w(offset);

 2735     weight_vec = __msa_fill_w(weight);
 
 2736     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 2739     LD_SH2(src1_ptr, src2_stride, in0, in1);
 
 2740     in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
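    /* Merge two 4-sample rows of the int16_t intermediate into one vector
     * so the 4-wide block reuses the full 8-lane weighting path. */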
 
 2747     dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec);
 
 2748     dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec);
 
 2750     out0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
 
 2752     out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0);
 
 2753     ST_W2(out0, 0, 1, dst, dst_stride);
 
 2758                                      const int16_t *src1_ptr,
 
 2776     v8i16 in0, in1, in2, in3;
 
 2778     v4i32 weight_vec, offset_vec, rnd_vec;
 
 2788     offset = (offset0 + offset1) << rnd_val;
 
 2789     weight0 = weight0 & 0x0000FFFF;
 
 2790     weight = weight0 | (weight1 << 16);
 
 2791     constant = 128 * weight1;
 
 2795     offset_vec = __msa_fill_w(offset);

 2796     weight_vec = __msa_fill_w(weight);
 
 2797     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 2801     LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
 
 2809                        weight_vec, rnd_vec, offset_vec,
 
 2812     dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
 
 2813     ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
 
 2818                                              const int16_t *src1_ptr,
 
 2837     v8i16 dst0, dst1, dst2, dst3;
 
 2838     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
 
 2840     v4i32 weight_vec, offset_vec, rnd_vec;
 
 2847     offset = (offset0 + offset1) << rnd_val;
 
 2848     weight0 = weight0 & 0x0000FFFF;
 
 2849     weight = weight0 | (weight1 << 16);
 
 2850     constant = 128 * weight1;
 
 2854     offset_vec = __msa_fill_w(offset);

 2855     weight_vec = __msa_fill_w(weight);
 
 2856     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 2860     for (loop_cnt = (height >> 3); loop_cnt--;) {
 
 2861         LD_SB8(src0_ptr, src_stride,
 
 2863         src0_ptr += (8 * src_stride);
 
 2864         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
 
 2865         src1_ptr += (4 * src2_stride);
 
 2866         LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
 
 2867         src1_ptr += (4 * src2_stride);
 
 2876         VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
 
 2878         VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
 
 2882                            weight_vec, rnd_vec, offset_vec,
 
 2883                            dst0, dst1, dst2, dst3);
 
 2886         ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
 
 2887         dst += (8 * dst_stride);
 
 2893                                     const int16_t *src1_ptr,
 
 2908                                  weight0, weight1, offset0, offset1, rnd_val);
 
 2909     } else if (4 == height) {
 
 2912                                  weight0, weight1, offset0, offset1, rnd_val);
 
 2913     } else if (0 == (height % 8)) {
 
 2915                                          src1_ptr, src2_stride,
 
 2917                                          weight0, weight1, offset0, offset1,
 
 2924                                     const int16_t *src1_ptr,
 
 2943     v8i16 in0, in1, in2, in3;
 
 2944     v8i16 dst0, dst1, dst2, dst3;
 
 2946     v4i32 weight_vec, offset_vec, rnd_vec;
 
 2953     offset = (offset0 + offset1) << rnd_val;
 
 2954     weight0 = weight0 & 0x0000FFFF;
 
 2955     weight = weight0 | (weight1 << 16);
 
 2956     constant = 128 * weight1;
 
 2960     offset_vec = __msa_fill_w(offset);

 2961     weight_vec = __msa_fill_w(weight);
 
 2962     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 2966     for (loop_cnt = 2; loop_cnt--;) {
 
 2968         src0_ptr += (4 * src_stride);
 
 2969         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
 
 2970         src1_ptr += (4 * src2_stride);
 
 2979         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 2984                            weight_vec, rnd_vec, offset_vec,
 
 2985                            dst0, dst1, dst2, dst3);
 
 2988         ST_W2(dst0, 0, 2, dst, dst_stride);

 2989         ST_H2(dst0, 2, 6, dst + 4, dst_stride);

 2990         ST_W2(dst1, 0, 2, dst + 2 * dst_stride, dst_stride);

 2991         ST_H2(dst1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
 
 2992         dst += (4 * dst_stride);
 
 2998                                      const int16_t *src1_ptr,
 
 3014     v16i8 mask1, vec0, vec1;
 
 3017     v4i32 weight_vec, offset_vec, rnd_vec;
 
 3024     offset = (offset0 + offset1) << rnd_val;
 
 3025     weight0 = weight0 & 0x0000FFFF;
 
 3026     weight = weight0 | (weight1 << 16);
 
 3027     constant = 128 * weight1;
 
 3031     offset_vec = __msa_fill_w(offset);

 3032     weight_vec = __msa_fill_w(weight);
 
 3033     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 3038     LD_SH2(src1_ptr, src2_stride, in0, in1);
 
 3045                        weight_vec, rnd_vec, offset_vec,
 
 3048     dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
 
 3049     ST_D2(dst0, 0, 1, dst, dst_stride);
 
 3054                                      const int16_t *src1_ptr,
 
 3068     v8i16 in0, in1, in2, in3, in4, in5;
 
 3072     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
 
 3074     v4i32 weight_vec, offset_vec, rnd_vec;
 
 3081     offset = (offset0 + offset1) << rnd_val;
 
 3082     weight0 = weight0 & 0x0000FFFF;
 
 3083     weight = weight0 | (weight1 << 16);
 
 3084     constant = 128 * weight1;
 
 3088     offset_vec = __msa_fill_w(offset);

 3089     weight_vec = __msa_fill_w(weight);
 
 3090     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 3096     LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
 
 3097     src1_ptr += (4 * src2_stride);
 
 3098     LD_SH2(src1_ptr, src2_stride, in4, in5);
 
 3106     VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 3108     VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
 
 3110     VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
 
 3114                        weight_vec, rnd_vec, offset_vec,
 
 3115                        dst0, dst1, dst2, dst3);
 
 3117                        weight_vec, rnd_vec, offset_vec,
 
 3121     dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
 
 3122     ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);

 3123     ST_D2(dst3, 0, 1, dst + 4 * dst_stride, dst_stride);
 
 3128                                              const int16_t *src1_ptr,
 
 3147     v8i16 in0, in1, in2, in3;
 
 3148     v8i16 dst0, dst1, dst2, dst3;
 
 3150     v4i32 weight_vec, offset_vec, rnd_vec;
 
 3157     offset = (offset0 + offset1) << rnd_val;
 
 3158     weight0 = weight0 & 0x0000FFFF;
 
 3159     weight = weight0 | (weight1 << 16);
 
 3160     constant = 128 * weight1;
 
 3164     offset_vec = __msa_fill_w(offset);

 3165     weight_vec = __msa_fill_w(weight);
 
 3166     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 3170     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 3172         src0_ptr += (4 * src_stride);
 
 3173         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
 
 3174         src1_ptr += (4 * src2_stride);
 
 3183         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 3187                            weight_vec, rnd_vec, offset_vec,
 
 3188                            dst0, dst1, dst2, dst3);
 
 3191         ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
 
 3192         dst += (4 * dst_stride);
 
 3198                                     const int16_t *src1_ptr,
 
 3213                                  weight0, weight1, offset0, offset1, rnd_val);
 
 3214     } else if (6 == height) {
 
 3217                                  weight0, weight1, offset0, offset1, rnd_val);
 
 3218     } else if (0 == (height % 4)) {
 
 3220                                          src1_ptr, src2_stride,
 
 3222                                          weight0, weight1, offset0, offset1,
 
 3229                                      const int16_t *src1_ptr,
 
 3245     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
 
 3248         8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
 
 3252     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
 
 3254     v4i32 weight_vec, offset_vec, rnd_vec;
 
 3261     offset = (offset0 + offset1) << rnd_val;
 
 3262     weight0 = weight0 & 0x0000FFFF;
 
 3263     weight = weight0 | (weight1 << 16);
 
 3264     constant = 128 * weight1;
 
 3268     offset_vec = __msa_fill_w(offset);

 3269     weight_vec = __msa_fill_w(weight);
 
 3270     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 3275     for (loop_cnt = 4; loop_cnt--;) {
 
 3277         src0_ptr += (4 * src_stride);
 
 3278         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
 
 3279         LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
 
 3280         src1_ptr += (4 * src2_stride);
 
 3290         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 3299                            weight_vec, rnd_vec, offset_vec,
 
 3300                            dst0, dst1, dst2, dst3);
 
 3302                            weight_vec, rnd_vec, offset_vec,
 
 3306         dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
 
 3307         ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);

 3308         ST_W4(dst3, 0, 1, 2, 3, dst + 8, dst_stride);
 
 3309         dst += (4 * dst_stride);
 
 3315                                      const int16_t *src1_ptr,
 
 3330     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
 
 3334     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
 3337     v4i32 weight_vec, offset_vec, rnd_vec;
 
 3344     offset = (offset0 + offset1) << rnd_val;
 
 3345     weight0 = weight0 & 0x0000FFFF;
 
 3346     weight = weight0 | (weight1 << 16);
 
 3347     constant = 128 * weight1;
 
 3351     offset_vec = __msa_fill_w(offset);

 3352     weight_vec = __msa_fill_w(weight);
 
 3353     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 3357     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 3359         LD_SB4(src0_ptr + 8, src_stride, src1, src3, src5, src7);
 
 3360         src0_ptr += (4 * src_stride);
 
 3361         LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
 
 3362         LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
 
 3363         src1_ptr += (4 * src2_stride);
 
 3372         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 3374         VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
 
 3376         VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
 
 3378         VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
 
 3380         VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
 
 3384                            weight_vec, rnd_vec, offset_vec,
 
 3385                            dst0, dst1, dst2, dst3);
 
 3389         dst += (2 * dst_stride);
 
 3393                            weight_vec, rnd_vec, offset_vec,
 
 3394                            dst0, dst1, dst2, dst3);
 
 3398         dst += (2 * dst_stride);
 
 3404                                      const int16_t *src1_ptr,
 
 3421     v16i8 mask1, mask2, mask3;
 
 3423     v8i16 dst0, dst1, dst2, dst3;
 
 3424     v8i16 in0, in1, in2, in3, in4, in5;
 
 3426     v4i32 weight_vec, offset_vec, rnd_vec;
 
 3433     offset = (offset0 + offset1) << rnd_val;
 
 3434     weight0 = weight0 & 0x0000FFFF;
 
 3435     weight = weight0 | (weight1 << 16);
 
 3436     constant = 128 * weight1;
 
 3440     offset_vec = __msa_fill_w(offset);

 3441     weight_vec = __msa_fill_w(weight);
 
 3442     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 3448     for (loop_cnt = 16; loop_cnt--;) {
 
 3450         LD_SB2(src0_ptr + 16, src_stride, src1, src3);
 
 3451         src0_ptr += (2 * src_stride);
 
 3452         LD_SH2(src1_ptr, src2_stride, in0, in2);
 
 3453         LD_SH2(src1_ptr + 8, src2_stride, in1, in3);
 
 3454         LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
 
 3455         src1_ptr += (2 * src2_stride);
 
 3468                            weight_vec, rnd_vec, offset_vec,
 
 3469                            dst0, dst1, dst2, dst3);
 
 3477         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 3480                            weight_vec, rnd_vec, offset_vec,
 
 3483         dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
 
 3484         ST_D2(dst0, 0, 1, (dst + 16), dst_stride);
 
 3485         dst += (2 * dst_stride);
 
 3491                                      const int16_t *src1_ptr,
 
 3508     v16i8 mask1, mask2, mask3;
 
 3509     v8i16 dst0, dst1, dst2, dst3;
 
 3511     v8i16 in0, in1, in2, in3;
 
 3513     v4i32 weight_vec, offset_vec, rnd_vec;
 
 3520     offset = (offset0 + offset1) << rnd_val;
 
 3521     weight0 = weight0 & 0x0000FFFF;
 
 3522     weight = weight0 | (weight1 << 16);
 
 3523     constant = 128 * weight1;
 
 3527     offset_vec = __msa_fill_w(offset);

 3528     weight_vec = __msa_fill_w(weight);
 
 3529     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 3535     for (loop_cnt = height; loop_cnt--;) {
 
 3538         src0_ptr += src_stride;
 
 3539         LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
 
 3540         src1_ptr += src2_stride;
 
 3553                            weight_vec, rnd_vec, offset_vec,
 
 3554                            dst0, dst1, dst2, dst3);
 
 3564                                      const int16_t *src1_ptr,
 
 3577     v8i16 in0, in1, dst10;
 
 3578     v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
 
 3579     v4i32 dst10_r, dst10_l;
 
 3581     v8i16 filter_vec, out;
 
 3582     v4i32 weight_vec, offset_vec, rnd_vec;
 
 3584     src0_ptr -= src_stride;
 
 3586     offset = (offset0 + offset1) << rnd_val;
 
 3587     weight0 = weight0 & 0x0000FFFF;
 
 3588     weight = weight0 | (weight1 << 16);
 
 3589     constant = 128 * weight1;
 
 3593     offset_vec = __msa_fill_w(offset);

 3594     weight_vec = __msa_fill_w(weight);
 
 3595     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 3601     src0_ptr += (3 * src_stride);
 
 3603     src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
 
 3604     src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
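    /* XOR with 128 re-centres the unsigned pixels into signed range for the
     * signed dot-product taps; the matching bias is made up elsewhere via
     * the `constant` term computed above (its use is elided here). */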
 
 3605     LD_SB2(src0_ptr, src_stride, src3, src4);
 
 3606     src0_ptr += (2 * src_stride);
 
 3607     LD_SH2(src1_ptr, src2_stride, in0, in1);
 
 3608     src1_ptr += (2 * src2_stride);
 
 3610     in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
 
 3612     src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
 
 3613     src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
 
 3618     dst10_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_r, (v8i16) weight_vec);
 
 3619     dst10_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_l, (v8i16) weight_vec);
 
 3622     out = __msa_pckev_h((v8i16) dst10_l, (v8i16) dst10_r);
 
 3623     out = (v8i16) __msa_pckev_b((v16i8) out, (v16i8) out);
 
 3629                                      const int16_t *src1_ptr,
 
 3642     v8i16 in0, in1, in2, in3;
 
 3643     v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
 
 3644     v16i8 src2110, src4332, src6554;
 
 3648     v4i32 weight_vec, offset_vec, rnd_vec;
 
 3650     src0_ptr -= src_stride;
 
 3652     offset = (offset0 + offset1) << rnd_val;
 
 3653     weight0 = weight0 & 0x0000FFFF;
 
 3654     weight = weight0 | (weight1 << 16);
 
 3655     constant = 128 * weight1;
 
 3659     offset_vec = __msa_fill_w(offset);

 3660     weight_vec = __msa_fill_w(weight);
 
 3661     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 3667     src0_ptr += (3 * src_stride);
 
 3669     src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
 
 3670     src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
 
 3672     LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
 
 3673     src0_ptr += (4 * src_stride);
 
 3674     LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
 
 3675     src1_ptr += (4 * src2_stride);
 
 3678                src32_r, src43_r, src54_r, src65_r);
 
 3679     ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
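    /* Each srcNNMM vector stacks two interleaved row pairs, so one 4-tap
     * call yields two 4-wide output rows at a time. */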
 
 3686                        weight_vec, rnd_vec, offset_vec,
 
 3689     dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
 
 3690     ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
 
 3691     dst += (4 * dst_stride);
 
 3696                                              const int16_t *src1_ptr,
 
 3710     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
 
 3711     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
 
 3712     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
 
 3713     v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
 
 3714     v16i8 src2110, src4332, src6554, src8776;
 
 3715     v8i16 dst10, dst32, dst54, dst76;
 
 3718     v4i32 weight_vec, offset_vec, rnd_vec;
 
 3720     src0_ptr -= src_stride;
 
 3722     offset = (offset0 + offset1) << rnd_val;
 
 3723     weight0 = weight0 & 0x0000FFFF;
 
 3724     weight = weight0 | (weight1 << 16);
 
 3725     constant = 128 * weight1;
 
 3729     offset_vec = __msa_fill_w(offset);

 3730     weight_vec = __msa_fill_w(weight);
 
 3731     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 3737     src0_ptr += (3 * src_stride);
 
 3739     src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
 
 3740     src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
 
 3742     for (loop_cnt = (height >> 3); loop_cnt--;) {
 
 3743         LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
 
 3744         src0_ptr += (6 * src_stride);
 
 3745         LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
 
 3746         src1_ptr += (8 * src2_stride);
 
 3752                    src32_r, src43_r, src54_r, src65_r);
 
 3753         ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
 
 3754         ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
 
 3755                    src4332, src6554, src8776);
 
 3763         src0_ptr += (2 * src_stride);
 
 3765         src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
 
 3766         src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
 
 3771                            weight_vec, rnd_vec, offset_vec,
 
 3772                            dst10, dst32, dst54, dst76);
 
 3774         PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst32);
 
 3775         ST_W8(dst10, dst32, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
 
 3776         dst += (8 * dst_stride);
 
 3782                                     const int16_t *src1_ptr,
 
 3797                                  weight0, weight1, offset0, offset1, rnd_val);
 
 3798     } else if (4 == height) {
 
 3801                                  weight0, weight1, offset0, offset1, rnd_val);
 
 3802     } else if (0 == (height % 8)) {
 
 3804                                          src1_ptr, src2_stride,
 
 3806                                          weight0, weight1, offset0, offset1,
 
 3813                                     const int16_t *src1_ptr,
 
 3829     v8i16 in0, in1, in2, in3;
 
 3830     v16i8 src10_r, src32_r, src21_r, src43_r;
 
 3831     v8i16 tmp0, tmp1, tmp2, tmp3;
 
 3834     v4i32 weight_vec, offset_vec, rnd_vec;
 
 3836     src0_ptr -= src_stride;
 
 3838     offset = (offset0 + offset1) << rnd_val;
 
 3839     weight0 = weight0 & 0x0000FFFF;
 
 3840     weight = weight0 | (weight1 << 16);
 
 3841     constant = 128 * weight1;
 
 3845     offset_vec = __msa_fill_w(offset);

 3846     weight_vec = __msa_fill_w(weight);
 
 3847     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 3853     src0_ptr += (3 * src_stride);
 
 3857     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 3858         LD_SB2(src0_ptr, src_stride, src3, src4);
 
 3859         src0_ptr += (2 * src_stride);
 
 3860         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
 
 3861         src1_ptr += (4 * src2_stride);
 
 3869         src0_ptr += (2 * src_stride);
 
 3877                            weight_vec, rnd_vec, offset_vec,
 
 3878                            tmp0, tmp1, tmp2, tmp3);
 
 3881         ST_W2(tmp0, 0, 2, dst, dst_stride);

 3882         ST_H2(tmp0, 2, 6, dst + 4, dst_stride);

 3883         ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride);

 3884         ST_H2(tmp1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
 
 3885         dst += (4 * dst_stride);
 
 3888         LD_SB2(src0_ptr, src_stride, src3, src4);
 
 3889         src0_ptr += (2 * src_stride);
 
 3890         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
 
 3891         src1_ptr += (4 * src2_stride);
 
 3899         src0_ptr += (2 * src_stride);
 
 3907                            weight_vec, rnd_vec, offset_vec,
 
 3908                            tmp0, tmp1, tmp2, tmp3);
 
 3911         ST_W2(tmp0, 0, 2, dst, dst_stride);

 3912         ST_H2(tmp0, 2, 6, dst + 4, dst_stride);

 3913         ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride);

 3914         ST_H2(tmp1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
 
 3920                                      const int16_t *src1_ptr,
 
 3933     v8i16 in0, in1, tmp0, tmp1;
 
 3934     v16i8 src10_r, src32_r, src21_r, src43_r;
 
 3937     v4i32 weight_vec, offset_vec, rnd_vec;
 
 3939     src0_ptr -= src_stride;
 
 3941     offset = (offset0 + offset1) << rnd_val;
 
 3942     weight0 = weight0 & 0x0000FFFF;
 
 3943     weight = weight0 | (weight1 << 16);
 
 3944     constant = 128 * weight1;
 
 3948     offset_vec = __msa_fill_w(offset);

 3949     weight_vec = __msa_fill_w(weight);
 
 3950     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 3956     src0_ptr += (3 * src_stride);
 
 3960     LD_SB2(src0_ptr, src_stride, src3, src4);
 
 3961     LD_SH2(src1_ptr, src2_stride, in0, in1);
 
 3968                        weight_vec, rnd_vec, offset_vec,
 
 3971     tmp0 = (v8i16) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
 
 3972     ST_D2(tmp0, 0, 1, dst, dst_stride);
 
 3977                                      const int16_t *src1_ptr,
 
 3989     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
 
 3990     v8i16 in0, in1, in2, in3, in4, in5;
 
 3991     v16i8 src10_r, src32_r, src54_r, src76_r;
 
 3992     v16i8 src21_r, src43_r, src65_r, src87_r;
 
 3993     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
 
 3996     v4i32 weight_vec, offset_vec, rnd_vec;
 
 3998     src0_ptr -= src_stride;
 
 4000     offset = (offset0 + offset1) << rnd_val;
 
 4001     weight0 = weight0 & 0x0000FFFF;
 
 4002     weight = weight0 | (weight1 << 16);
 
 4003     constant = 128 * weight1;
 
 4007     offset_vec = __msa_fill_w(offset);

 4008     weight_vec = __msa_fill_w(weight);
 
 4009     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 4015     src0_ptr += (3 * src_stride);
 
 4019     LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
 
 4020     LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
 
 4023                src32_r, src43_r, src54_r, src65_r);
 
 4024     ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
 
 4034                        weight_vec, rnd_vec, offset_vec,
 
 4035                        tmp0, tmp1, tmp2, tmp3);
 
 4037                        weight_vec, rnd_vec, offset_vec,
 
 4041     tmp3 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
 
 4042     ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);

 4043     ST_D2(tmp3, 0, 1, dst + 4 * dst_stride, dst_stride);
 
 4048                                              const int16_t *src1_ptr,
 
 4063     v8i16 in0, in1, in2, in3;
 
 4064     v16i8 src10_r, src32_r, src21_r, src43_r;
 
 4065     v8i16 tmp0, tmp1, tmp2, tmp3;
 
 4068     v4i32 weight_vec, offset_vec, rnd_vec;
 
 4070     src0_ptr -= src_stride;
 
 4072     offset = (offset0 + offset1) << rnd_val;
 
 4073     weight0 = weight0 & 0x0000FFFF;
 
 4074     weight = weight0 | (weight1 << 16);
 
 4075     constant = 128 * weight1;
 
 4079     offset_vec = __msa_fill_w(offset);

 4080     weight_vec = __msa_fill_w(weight);
 
 4081     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 4087     src0_ptr += (3 * src_stride);
 
 4091     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 4092         LD_SB2(src0_ptr, src_stride, src3, src4);
 
 4093         src0_ptr += (2 * src_stride);
 
 4094         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
 
 4095         src1_ptr += (4 * src2_stride);
 
 4103         src0_ptr += (2 * src_stride);
 
 4111                            weight_vec, rnd_vec, offset_vec,
 
 4112                            tmp0, tmp1, tmp2, tmp3);
 
 4115         ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
 
 4116         dst += (4 * dst_stride);
 
 4122                                     const int16_t *src1_ptr,
 
 4137                                  weight0, weight1, offset0, offset1, rnd_val);
 
 4138     } else if (6 == height) {
 
 4141                                  weight0, weight1, offset0, offset1, rnd_val);
 
 4144                                          src1_ptr, src2_stride,
 
 4146                                          weight0, weight1, offset0, offset1,
 
 4153                                      const int16_t *src1_ptr,
 
 4168     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
 
 4169     v16i8 src10_r, src32_r, src21_r, src43_r;
 
 4170     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
 
 4171     v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
 
 4172     v16i8 src2110, src4332;
 
 4175     v4i32 weight_vec, offset_vec, rnd_vec;
 
 4177     src0_ptr -= (1 * src_stride);
 
 4179     offset = (offset0 + offset1) << rnd_val;
 
 4180     weight0 = weight0 & 0x0000FFFF;
 
 4181     weight = weight0 | (weight1 << 16);
 
 4182     constant = 128 * weight1;
 
 4186     offset_vec = __msa_fill_w(offset);

 4187     weight_vec = __msa_fill_w(weight);
 
 4188     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 4194     src0_ptr += (3 * src_stride);
 
 4198     src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
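    /* 12-wide: pack the left (columns 8..11) halves of two row pairs into
     * one vector so the extra 4-wide column is filtered alongside the
     * 8-wide body. */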
 
 4200     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 4201         LD_SB2(src0_ptr, src_stride, src3, src4);
 
 4202         src0_ptr += (2 * src_stride);
 
 4203         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
 
 4204         LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
 
 4205         src1_ptr += (4 * src2_stride);
 
 4211         src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
 
 4218         src0_ptr += (2 * src_stride);
 
 4222         src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
 
 4229                            weight_vec, rnd_vec, offset_vec,
 
 4230                            tmp0, tmp1, tmp2, tmp3);
 
 4232                            weight_vec, rnd_vec, offset_vec,
 
 4236         tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
 
 4237         ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);

 4238         ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
 
 4239         dst += (4 * dst_stride);
 
 4245                                      const int16_t *src1_ptr,
 
 4260     v8i16 in0, in1, in2, in3;
 
 4261     v16i8 src10_r, src32_r, src21_r, src43_r;
 
 4262     v16i8 src10_l, src32_l, src21_l, src43_l;
 
 4263     v8i16 tmp0, tmp1, tmp2, tmp3;
 
 4266     v4i32 weight_vec, offset_vec, rnd_vec;
 
 4268     src0_ptr -= src_stride;
 
 4270     offset = (offset0 + offset1) << rnd_val;
 
 4271     weight0 = weight0 & 0x0000FFFF;
 
 4272     weight = weight0 | (weight1 << 16);
 
 4273     constant = 128 * weight1;
 
 4277     offset_vec = __msa_fill_w(offset);

 4278     weight_vec = __msa_fill_w(weight);
 
 4279     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 4285     src0_ptr += (3 * src_stride);
 
 4290     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 4291         LD_SB2(src0_ptr, src_stride, src3, src4);
 
 4292         src0_ptr += (2 * src_stride);
 
 4293         LD_SH2(src1_ptr, src2_stride, in0, in1);
 
 4294         LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
 
 4295         src1_ptr += (2 * src2_stride);
 
 4307                            weight_vec, rnd_vec, offset_vec,
 
 4308                            tmp0, tmp1, tmp2, tmp3);
 
 4311         dst += (2 * dst_stride);
 
 4313         src0_ptr += (2 * src_stride);
 
 4315         LD_SH2(src1_ptr, src2_stride, in0, in1);
 
 4316         LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
 
 4317         src1_ptr += (2 * src2_stride);
 
 4328                            weight_vec, rnd_vec, offset_vec,
 
 4329                            tmp0, tmp1, tmp2, tmp3);
 
 4333         dst += (2 * dst_stride);
 
 4339                                      const int16_t *src1_ptr,
 
 4354     v16i8 src6, src7, src8, src9, src10, src11;
 
 4355     v8i16 in0, in1, in2, in3, in4, in5;
 
 4356     v16i8 src10_r, src32_r, src76_r, src98_r;
 
 4357     v16i8 src10_l, src32_l, src21_l, src43_l;
 
 4358     v16i8 src21_r, src43_r, src87_r, src109_r;
 
 4359     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
 
 4362     v4i32 weight_vec, offset_vec, rnd_vec;
 
 4364     src0_ptr -= src_stride;
 
 4366     offset = (offset0 + offset1) << rnd_val;
 
 4367     weight0 = weight0 & 0x0000FFFF;
 
 4368     weight = weight0 | (weight1 << 16);
 
 4369     constant = 128 * weight1;
 
 4373     offset_vec = __msa_fill_w(offset);

 4374     weight_vec = __msa_fill_w(weight);
 
 4375     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 4386     LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
 
 4387     src0_ptr += (3 * src_stride);
 
 4389     ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
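    /* 24-wide: the 16-wide body (rows in src0..src4) and the 8-wide tail at
     * x = 16 (rows in src6..src8) keep separate interleaved row histories. */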
 
 4391     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 4393         LD_SB2(src0_ptr, src_stride, src3, src4);
 
 4394         LD_SH2(src1_ptr, src2_stride, in0, in1);
 
 4395         LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
 
 4401         LD_SB2(src0_ptr + 16, src_stride, src9, src10);
 
 4402         src0_ptr += (2 * src_stride);
 
 4403         LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
 
 4404         src1_ptr += (2 * src2_stride);
 
 4406         ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
 
 4418                            weight_vec, rnd_vec, offset_vec,
 
 4419                            tmp0, tmp1, tmp4, tmp5);
 
 4422                            weight_vec, rnd_vec, offset_vec,
 
 4427         tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
 
 4429         ST_D2(tmp2, 0, 1, dst + 16, dst_stride);
 
 4430         dst += (2 * dst_stride);
 
 4434         LD_SH2(src1_ptr, src2_stride, in0, in1);
 
 4435         LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
 
 4440         LD_SB2(src0_ptr + 16, src_stride, src11, src8);
 
 4441         src0_ptr += (2 * src_stride);
 
 4442         LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
 
 4443         src1_ptr += (2 * src2_stride);
 
 4445         ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
 
 4457                            weight_vec, rnd_vec, offset_vec,
 
 4458                            tmp0, tmp1, tmp4, tmp5);
 
 4461                            weight_vec, rnd_vec, offset_vec,
 
 4467         tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
 
 4469         ST_D2(tmp2, 0, 1, dst + 16, dst_stride);
 
 4470         dst += (2 * dst_stride);
 
 4476                                      const int16_t *src1_ptr,
 
 4489     uint8_t *dst_tmp = dst + 16;
 
 4491     v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
 
 4492     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
 
 4493     v16i8 src10_r, src32_r, src76_r, src98_r;
 
 4494     v16i8 src21_r, src43_r, src87_r, src109_r;
 
 4495     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 
 4496     v16i8 src10_l, src32_l, src76_l, src98_l;
 
 4497     v16i8 src21_l, src43_l, src87_l, src109_l;
 
 4500     v4i32 weight_vec, offset_vec, rnd_vec;
 
 4502     src0_ptr -= src_stride;
 
 4504     offset = (offset0 + offset1) << rnd_val;
 
 4505     weight0 = weight0 & 0x0000FFFF;
 
 4506     weight = weight0 | (weight1 << 16);
 
 4507     constant = 128 * weight1;
 
 4511     offset_vec = __msa_fill_w(offset);

 4512     weight_vec = __msa_fill_w(weight);
 
 4513     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 4524     LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
 
 4525     src0_ptr += (3 * src_stride);
 
 4527     ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
 
 4528     ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
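    /* 32-wide: the second 16-wide column at x = 16 needs both right and
     * left byte interleaves of its row pairs, mirroring the first column. */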
 
 4530     for (loop_cnt = (height >> 1); loop_cnt--;) {
 
 4532         LD_SB2(src0_ptr, src_stride, src3, src4);
 
 4533         LD_SH2(src1_ptr, src2_stride, in0, in1);
 
 4534         LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
 
 4547                            weight_vec, rnd_vec, offset_vec,
 
 4548                            tmp0, tmp1, tmp4, tmp5);
 
 4552         dst += (2 * dst_stride);
 
 4561         LD_SB2(src0_ptr + 16, src_stride, src9, src10);
 
 4562         src0_ptr += (2 * src_stride);
 
 4563         LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
 
 4564         LD_SH2(src1_ptr + 24, src2_stride, in6, in7);
 
 4565         src1_ptr += (2 * src2_stride);
 
 4567         ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
 
 4568         ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
 
 4577                            weight_vec, rnd_vec, offset_vec,
 
 4578                            tmp2, tmp3, tmp6, tmp7);
 
 4582         ST_SH2(tmp2, tmp3, dst_tmp, dst_stride);
 
 4583         dst_tmp += (2 * dst_stride);
 
 4595                                      const int16_t *src1_ptr,
 
 4599                                      const int8_t *filter_x,
 
 4600                                      const int8_t *filter_y,
 
 4613     v8i16 filt_h0, filt_h1;
 
 4616     v8i16 filter_vec, tmp, weight_vec;
 
 4617     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
 
 4618     v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43, tmp0, tmp1;
 
 4619     v4i32 dst0, dst1, offset_vec, rnd_vec, const_vec;
 
 4621     src0_ptr -= (src_stride + 1);
 
 4623     filter_vec = LD_SH(filter_x);

 4626     filter_vec = LD_SH(filter_y);
 
 4633     offset = (offset0 + offset1) << rnd_val;
 
 4634     weight0 = weight0 & 0x0000FFFF;
 
 4635     weight = weight0 | (weight1 << 16);
 
 4637     const_vec = __msa_fill_w((128 * weight1));
 
 4639     offset_vec = __msa_fill_w(offset);

 4640     weight_vec = (v8i16) __msa_fill_w(weight);
 
 4641     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 4642     offset_vec += const_vec;
 
 4662     dst0 = (v4i32) __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
 
 4664     LD2(src1_ptr, src2_stride, tp0, tp1);
 
 4668     dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
 
 4669     dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
 
 4672     tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
 
 4673     out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
 
 4679                                      const int16_t *src1_ptr,
 
 4683                                      const int8_t *filter_x,
 
 4684                                      const int8_t *filter_y,
 
 4694     v8i16 in0 = { 0 }, in1 = { 0 };
 
 4697     v8i16 filt_h0, filt_h1;
 
 4700     v8i16 filter_vec, weight_vec;
 
 4701     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 4702     v8i16 tmp0, tmp1, tmp2, tmp3;
 
 4703     v8i16 dst30, dst41, dst52, dst63;
 
 4704     v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
 
 4705     v4i32 offset_vec, rnd_vec, const_vec;
 
 4706     v4i32 dst0, dst1, dst2, dst3;
 
 4708     src0_ptr -= (src_stride + 1);
 
 4710     filter_vec = LD_SH(filter_x);

 4713     filter_vec = LD_SH(filter_y);
 
 4720     offset = (offset0 + offset1) << rnd_val;
 
 4721     weight0 = weight0 & 0x0000FFFF;
 
 4722     weight = weight0 | (weight1 << 16);
 
 4724     const_vec = __msa_fill_w((128 * weight1));
 
 4726     offset_vec = __msa_fill_w(offset);

 4727     weight_vec = (v8i16) __msa_fill_w(weight);
 
 4728     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 4729     offset_vec += const_vec;
 
 4737     VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
 
 4751     SRA_4V(dst0, dst1, dst2, dst3, 6);
 
 4754     LD2(src1_ptr, src2_stride, tp0, tp1);
 
 4756     src1_ptr += (2 * src2_stride);
 
 4757     LD2(src1_ptr, src2_stride, tp0, tp1);
 
 4763     dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
 
 4764     dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
 
 4765     dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
 
 4766     dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
 
 4770     out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
 
 4776                                              const int16_t *src1_ptr,
 
 4780                                              const int8_t *filter_x,
 
 4781                                              const int8_t *filter_y,
 
 4793     v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
 
 4794     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
 4796     v8i16 filt_h0, filt_h1;
 
 4797     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 4800     v8i16 filter_vec, weight_vec;
 
 4801     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 
 4802     v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
 
 4803     v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
 
 4804     v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
 
 4805     v8i16 dst98_r, dst109_r;
 
 4806     v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
 4807     v4i32 offset_vec, rnd_vec, const_vec;
 
 4809     src0_ptr -= (src_stride + 1);
 
 4811     filter_vec = LD_SH(filter_x);

 4814     filter_vec = LD_SH(filter_y);
 
 4821     offset = (offset0 + offset1) << rnd_val;
 
 4822     weight0 = weight0 & 0x0000FFFF;
 
 4823     weight = weight0 | (weight1 << 16);
 
 4825     const_vec = __msa_fill_w((128 * weight1));
 
 4827     offset_vec = __msa_fill_w(offset);

 4828     weight_vec = (v8i16) __msa_fill_w(weight);
 
 4829     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 4830     offset_vec += const_vec;
 
 4833     src0_ptr += (3 * src_stride);
 
 4841     dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
 
 4843     for (loop_cnt = height >> 3; loop_cnt--;) {
 
 4844         LD_SB8(src0_ptr, src_stride,
 
 4845                src3, src4, src5, src6, src7, src8, src9, src10);
 
 4846         src0_ptr += (8 * src_stride);
 
 4848         VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
 
 4849         VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
 
 4850         VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
 
 4851         VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
 
 4858         dst32_r = __msa_ilvr_h(dst73, dst22);
 
 4862         dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
 
 4863         dst76_r = __msa_ilvr_h(dst22, dst106);
 
 4865         LD2(src1_ptr, src2_stride, tp0, tp1);
 
 4866         src1_ptr += 2 * src2_stride;
 
 4868         LD2(src1_ptr, src2_stride, tp0, tp1);
 
 4869         src1_ptr += 2 * src2_stride;
 
 4872         LD2(src1_ptr, src2_stride, tp0, tp1);
 
 4873         src1_ptr += 2 * src2_stride;
 
 4875         LD2(src1_ptr, src2_stride, tp0, tp1);
 
 4876         src1_ptr += 2 * src2_stride;
 
 4887         SRA_4V(dst0, dst1, dst2, dst3, 6);
 
 4888         SRA_4V(dst4, dst5, dst6, dst7, 6);
 
 4889         PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1,
 
 4895         dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
 
 4896         dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
 
 4897         dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
 
 4898         dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
 
 4899         dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
 
 4900         dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
 
 4901         dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
 
 4902         dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
 
 4907         PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
 
 4910         ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
 
 4911         dst += (8 * dst_stride);
 
 4915         dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
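        /* Duplicate the newest filtered row into both halves so the next
         * iteration can zip it with fresh rows (sliding 4-tap window). */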
 
 4921                                     const int16_t *src1_ptr,
 
 4925                                     const int8_t *filter_x,
 
 4926                                     const int8_t *filter_y,
 
 4936                                  dst, dst_stride, filter_x, filter_y,
 
 4937                                  weight0, weight1, offset0, offset1, rnd_val);
 
 4938     } else if (4 == height) {
 
 4940                                  dst, dst_stride, filter_x, filter_y,
 
 4941                                  weight0, weight1, offset0, offset1, rnd_val);
 
 4942     } else if (0 == (height % 8)) {
 
 4944                                          src1_ptr, src2_stride,
 
 4945                                          dst, dst_stride, filter_x, filter_y,
 
 4946                                          height, weight0, weight1,
 
 4947                                          offset0, offset1, rnd_val);
 
 4953                                     const int16_t *src1_ptr,
 
 4957                                     const int8_t *filter_x,
 
 4958                                     const int8_t *filter_y,
 
 4966     uint32_t tpw0, tpw1, tpw2, tpw3;
 
 4969     v16u8 out0, out1, out2;
 
 4970     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
 4971     v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
 
 4972     v8i16 in4 = { 0 }, in5 = { 0 };
 
 4974     v8i16 filt_h0, filt_h1, filter_vec;
 
 4975     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 4978     v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
 
 4979     v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, weight_vec;
 
 4980     v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
 
 4981     v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
 
 4982     v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
 
 4983     v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
 
 4984     v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
 
 4985     v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
 4986     v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
 
 4987     v4i32 offset_vec, rnd_vec, const_vec;
 
 4989     src0_ptr -= (src_stride + 1);
 
 4991     filter_vec = LD_SH(filter_x);

 4994     filter_vec = LD_SH(filter_y);
 
 5001     offset = (offset0 + offset1) << rnd_val;
 
 5002     weight0 = weight0 & 0x0000FFFF;
 
 5003     weight = weight0 | (weight1 << 16);
 
 5005     const_vec = __msa_fill_w((128 * weight1));
 
 5007     offset_vec = __msa_fill_w(offset);

 5008     weight_vec = (v8i16) __msa_fill_w(weight);
 
 5009     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 5010     offset_vec += const_vec;
 
 5013     src0_ptr += (3 * src_stride);
 
 5026     LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
 
 5030     VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 5031     VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
 
 5032     VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
 
 5033     VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
 
 5040     VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
 
 5041     VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
 
 5042     VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
 
 5043     VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
 
 5058     PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
 
 5059     PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
 
 5060     dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
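    /* 6-wide hv: of the _l (columns 4..7) results only columns 4..5 are
     * stored, so the low doublewords of consecutive row pairs are packed
     * with pckev_d and filtered together, halving the vertical work on the
     * 2-wide tail. */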
 
 5073     dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
 
 5074     SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
 
 5075     SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
 
 5076     SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
 
 5077     PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0, dst1);
 
 5078     PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst2, dst3);
 
 5080     LD2(src1_ptr, src2_stride, tp0, tp1);
 
 5082     LD2(src1_ptr + 2 * src2_stride, src2_stride, tp0, tp1);
 
 5085     LD2(src1_ptr + 4 * src2_stride, src2_stride, tp0, tp1);
 
 5087     LD2(src1_ptr + 6 * src2_stride, src2_stride, tp0, tp1);
 
 5094     dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
 
 5095     dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
 
 5096     dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
 
 5097     dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
 
 5098     dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
 
 5099     dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
 
 5100     dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
 
 5101     dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
 
 5106     PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
 
 5109     ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
 
 5111     PCKEV_H2_SW(dst1_l, dst0_l, dst3_l, dst2_l, dst4, dst5);
 
 5113     LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
 
 5114     src1_ptr += (4 * src2_stride);
 
 5116     LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
 
 5122     dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
 
 5123     dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
 
 5124     dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
 
 5125     dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
 
 5130     out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
 
 5131     ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
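    /* The 6-wide block is written as a 4-wide column of words (ST_W8 above)
     * plus a 2-wide column of halfwords at x = 4 (ST_H8). */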
 
 5136                                      const int16_t *src1_ptr,
 
 5140                                      const int8_t *filter_x,
 
 5141                                      const int8_t *filter_y,
 
 5152     v8i16 filt_h0, filt_h1;
 
 5155     v8i16 filter_vec, weight_vec;
 
 5156     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
 
 5157     v8i16 dst0, dst1, dst2, dst3, dst4;
 
 5159     v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
 
 5160     v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
 
 5161     v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
 
 5162     v8i16 tmp0, tmp1, tmp2, tmp3;
 
 5163     v4i32 offset_vec, rnd_vec, const_vec;
 
 5165     src0_ptr -= (src_stride + 1);
 
 5167     filter_vec = LD_SH(filter_x);

 5170     filter_vec = LD_SH(filter_y);
 
 5177     offset = (offset0 + offset1) << rnd_val;
 
 5178     weight0 = weight0 & 0x0000FFFF;
 
 5179     weight = weight0 | (weight1 << 16);
 
 5181     const_vec = __msa_fill_w((128 * weight1));
 
 5183     offset_vec = __msa_fill_w(offset);

 5184     weight_vec = (v8i16) __msa_fill_w(weight);
 
 5185     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 5186     offset_vec += const_vec;
 
 5191     LD_SH2(src1_ptr, src2_stride, in0, in1);
 
 5196     VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
 
 5197     VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
 
 5213     SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
 
 5214     PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);
 
 5219     dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
 
 5220     dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
 
 5221     dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
 
 5222     dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
 
 5223     SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
 
 5225     PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
 
 5226     out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
 
 5232                                          const int16_t *src1_ptr,
 
 5236                                          const int8_t *filter_x,
 
 5237                                          const int8_t *filter_y,
 
 5248     v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
 
 5249     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 5250     v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, weight_vec;
 
 5251     v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
 
 5252     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, in0, in1, in2, in3;
 
 5253     v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
 
 5254     v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
 
 5255     v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
 
 5256     v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
 5257     v4i32 offset_vec, rnd_vec, const_vec;
 
 5259     src0_ptr -= (src_stride + 1);
 
 5261     filter_vec = LD_SH(filter_x);

 5264     filter_vec = LD_SH(filter_y);
 
 5272     offset = (offset0 + offset1) << rnd_val;
 
 5273     weight0 = weight0 & 0x0000FFFF;
 
 5274     weight = weight0 | (weight1 << 16);
 
 5276     const_vec = __msa_fill_w((128 * weight1));
 
 5278     offset_vec = __msa_fill_w(offset);

 5279     rnd_vec = __msa_fill_w(rnd_val + 1);

 5280     offset_vec += const_vec;

 5281     weight_vec = (v8i16) __msa_fill_w(weight);
 
 5283     for (cnt = width8mult; cnt--;) {
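        /* One pass per 8-pixel-wide column; width8mult is expected to be
         * width / 8, supplied by callers that are elided in this listing. */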
 
 5288         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
 
 5302         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 5303         VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
 
 5304         VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
 
 5305         VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
 
 5326         SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
 
 5327         SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
 
 5328         PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
 
 5329                     dst3_r, dst0, dst1, dst2, dst3);
 
 5335         dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
 
 5336         dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
 
 5337         dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
 
 5338         dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
 
 5339         dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
 
 5340         dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
 
 5341         dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
 
 5342         dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
 
 5347         PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
 
 5348                     tmp0, tmp1, tmp2, tmp3);
 
 5350         ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
 
 5357                                      const int16_t *src1_ptr,
 
 5361                                      const int8_t *filter_x,
 
 5362                                      const int8_t *filter_y,
 
 5370     v16u8 out0, out1, out2;
 
 5371     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
 
 5373     v8i16 filt_h0, filt_h1;
 
 5376     v8i16 filter_vec, weight_vec;
 
 5377     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
 
 5378     v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
 
 5379     v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
 
 5380     v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
 
 5381     v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
 
 5382     v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
 
 5383     v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
 
 5384     v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
 
 5385     v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
 
 5386     v8i16 in0, in1, in2, in3, in4, in5;
 
 5387     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 
 5388     v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
 5389     v4i32 offset_vec, rnd_vec, const_vec;
 
 5391     src0_ptr -= (src_stride + 1);
 
 5393     filter_vec = LD_SH(filter_x);
 
 5396     filter_vec = LD_SH(filter_y);
 
 5403     offset = (offset0 + offset1) << rnd_val;
 
 5404     weight0 = weight0 & 0x0000FFFF;
 
 5405     weight = weight0 | (weight1 << 16);
 
 5407     const_vec = __msa_fill_w((128 * weight1));
 
 5409     offset_vec = __msa_fill_w(offset);
 
 5410     weight_vec = (v8i16) __msa_fill_w(weight);
 
 5411     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 5412     offset_vec += const_vec;
 
 5415     src0_ptr += (5 * src_stride);
 
 5416     LD_SB4(src0_ptr, src_stride, src5, src6, src7, src8);
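          /* Six output rows through a 4-tap vertical filter span nine input
           * rows; src5..src8 complete the nine after the five loaded above. */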
 
 5421     LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
 
 5426     VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
 
 5427     VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
 
 5428     VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
 
 5429     VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
 
 5430     VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
 
 5431     VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
 
 5465     SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
 
 5466     SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
 
 5467     SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
 
 5468     PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
 
 5469                 dst0, dst1, dst2, dst3);
 
 5475     dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
 
 5476     dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
 
 5477     dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
 
 5478     dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
 
 5479     dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
 
 5480     dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
 
 5481     dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
 
 5482     dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
 
 5487     PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
 
 5488                 tmp0, tmp1, tmp2, tmp3);
 
 5491     PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst0, dst1);
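          /* Rows 4 and 5 take the same interleave/dpadd/round path as
           * rows 0-3 above. */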
 
 5494     dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
 
 5495     dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
 
 5496     dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
 
 5497     dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
 
 5501     out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
 
 5502     ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
 
 5503     ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
 
 5508                                              const int16_t *src1_ptr,
 
 5512                                              const int8_t *filter_x,
 
 5513                                              const int8_t *filter_y,
 
 5525     const uint8_t *src0_ptr_tmp;
 
 5526     const int16_t *src1_ptr_tmp;
 
 5530     v8i16 in0, in1, in2, in3;
 
 5532     v8i16 filt_h0, filt_h1;
 
 5536     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 5537     v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
 
 5538     v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
 
 5539     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 
 5540     v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
 5541     v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
 
 5542     v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l, weight_vec;
 
 5543     v4i32 offset_vec, rnd_vec, const_vec;
 
 5545     src0_ptr -= (src_stride + 1);
 
 5547     filter_vec = LD_SH(filter_x);
 
 5550     filter_vec = LD_SH(filter_y);
 
 5557     offset = (offset0 + offset1) << rnd_val;
 
 5558     weight0 = weight0 & 0x0000FFFF;
 
 5559     weight = weight0 | (weight1 << 16);
 
 5561     const_vec = __msa_fill_w((128 * weight1));
 
 5563     offset_vec = __msa_fill_w(offset);
 
 5564     weight_vec = (v8i16) __msa_fill_w(weight);
 
 5565     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 5566     offset_vec += const_vec;
 
 5568     for (cnt = width >> 3; cnt--;) {
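              /* One 8-pixel-wide column strip per iteration; the inner loop
               * below produces four output rows at a time. */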
 
 5569         src0_ptr_tmp = src0_ptr;
 
 5570         src1_ptr_tmp = src1_ptr;
 
 5574         src0_ptr_tmp += (3 * src_stride);
 
 5587         for (loop_cnt = height >> 2; loop_cnt--;) {
 
 5588             LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
 
 5589             src0_ptr_tmp += (4 * src_stride);
 
 5590             LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
 
 5591             src1_ptr_tmp += (4 * src2_stride);
 
 5594             VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 5595             VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
 
 5596             VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
 
 5597             VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
 
 5618             SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
 
 5619             SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
 
 5620             PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
 
 5621                         dst3_r, dst0, dst1, dst2, dst3);
 
 5626             dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
 
 5627             dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
 
 5628             dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
 
 5629             dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
 
 5630             dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
 
 5631             dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
 
 5632             dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
 
 5633             dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
 
 5638             PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
 
 5639                         tmp0, tmp1, tmp2, tmp3);
 
 5641             ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
 
 5642             dst_tmp += (4 * dst_stride);
 
 5659                                     const int16_t *src1_ptr,
 
 5663                                     const int8_t *filter_x,
 
 5664                                     const int8_t *filter_y,
 
 5674                                  dst, dst_stride, filter_x, filter_y,
 
 5675                                  weight0, weight1, offset0, offset1, rnd_val);
 
 5676     } else if (4 == height) {
 
 5678                                      src2_stride, dst, dst_stride, filter_x,
 
 5679                                      filter_y, weight0, weight1, offset0,
 
 5680                                      offset1, rnd_val, 1);
 
 5681     } else if (6 == height) {
 
 5683                                  dst, dst_stride, filter_x, filter_y,
 
 5684                                  weight0, weight1, offset0, offset1, rnd_val);
 
 5685     } else if (0 == (height % 4)) {
 
 5687                                          src1_ptr, src2_stride,
 
 5688                                          dst, dst_stride, filter_x, filter_y,
 
 5690                                          weight1, offset0, offset1, rnd_val, 8);
 
 5696                                      const int16_t *src1_ptr,
 
 5700                                      const int8_t *filter_x,
 
 5701                                      const int8_t *filter_y,
 
 5712     const uint8_t *src0_ptr_tmp;
 
 5713     const int16_t *src1_ptr_tmp;
 
 5716     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
 5717     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 5718     v16i8 mask0, mask1, mask2, mask3;
 
 5719     v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
 
 5720     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 
 5721     v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, weight_vec;
 
 5722     v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
 
 5723     v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
 
 5724     v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
 
 5725     v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
 
 5726     v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
 
 5727     v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
 
 5728     v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
 5729     v4i32 offset_vec, rnd_vec, const_vec;
 
 5731     src0_ptr -= (src_stride + 1);
 
 5733     filter_vec = LD_SH(filter_x);
 
 5736     filter_vec = LD_SH(filter_y);
 
 5744     offset = (offset0 + offset1) << rnd_val;
 
 5745     weight0 = weight0 & 0x0000FFFF;
 
 5746     weight = weight0 | (weight1 << 16);
 
 5748     const_vec = __msa_fill_w((128 * weight1));
 
 5750     offset_vec = __msa_fill_w(offset);
 
 5751     rnd_vec = __msa_fill_w(rnd_val + 1);
 
 5752     offset_vec += const_vec;
 
 5753     weight_vec = (v8i16) __msa_fill_w(weight);
 
 5755     src0_ptr_tmp = src0_ptr;
 
 5757     src1_ptr_tmp = src1_ptr;
 
 5760     src0_ptr_tmp += (3 * src_stride);
 
 5775     for (loop_cnt = 4; loop_cnt--;) {
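              /* 12-wide case, left 8-pixel-wide part first: four iterations
               * of four rows each (16 rows). */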
 
 5776         LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
 
 5777         src0_ptr_tmp += (4 * src_stride);
 
 5780         LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
 
 5781         src1_ptr_tmp += (4 * src2_stride);
 
 5783         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 5784         VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
 
 5785         VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
 
 5786         VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
 
 5807         SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
 
 5808         SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
 
 5809         PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
 
 5810                     dst3_r, dst0, dst1, dst2, dst3);
 
 5815         dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
 
 5816         dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
 
 5817         dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
 
 5818         dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
 
 5819         dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
 
 5820         dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
 
 5821         dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
 
 5822         dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
 
 5827         PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
 
 5828                     tmp0, tmp1, tmp2, tmp3);
 
 5830         ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
 
 5831         dst_tmp += (4 * dst_stride);
 
 5848     src0_ptr += (3 * src_stride);
 
 5857     dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
 
 5859     for (loop_cnt = 2; loop_cnt--;) {
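              /* Remaining 4-pixel-wide right part, eight rows per iteration:
               * mask2/mask3 pack two rows four apart into each vector so both
               * are filtered in one pass. */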
 
 5860         LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
                     src10);
 5862         src0_ptr += (8 * src_stride);
 
 5864         VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
 
 5865         VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
 
 5866         VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
 
 5867         VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
 
 5874         dst32_r = __msa_ilvr_h(dst73, dst22);
 
 5878         dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
 
 5879         dst76_r = __msa_ilvr_h(dst22, dst106);
 
 5881         LD2(src1_ptr, src2_stride, tp0, tp1);
 
 5882         src1_ptr += 2 * src2_stride;
 
 5884         LD2(src1_ptr, src2_stride, tp0, tp1);
 
 5885         src1_ptr += 2 * src2_stride;
 
 5888         LD2(src1_ptr, src2_stride, tp0, tp1);
 
 5889         src1_ptr += 2 * src2_stride;
 
 5891         LD2(src1_ptr, src2_stride, tp0, tp1);
 
 5892         src1_ptr += 2 * src2_stride;
 
 5904         SRA_4V(dst0, dst1, dst2, dst3, 6);
 
 5905         SRA_4V(dst4, dst5, dst6, dst7, 6);
 
 5906         PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
 
 5907                     dst0, dst1, dst2, dst3);
 
 5912         dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
 
 5913         dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
 
 5914         dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
 
 5915         dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
 
 5916         dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
 
 5917         dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
 
 5918         dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
 
 5919         dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
 
 5924         PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
 
 5925                     tmp0, tmp1, tmp2, tmp3);
 
 5927         ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
 
 5928         dst += (8 * dst_stride);
 
 5932         dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
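              /* Keep the newest filtered rows as vertical-filter history for
               * the next iteration. */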
 
 5938                                      const int16_t *src1_ptr,
 
 5942                                      const int8_t *filter_x,
 
 5943                                      const int8_t *filter_y,
 
 5953                                      src2_stride, dst, dst_stride, filter_x,
 
 5954                                      filter_y, weight0, weight1, offset0,
 
 5955                                      offset1, rnd_val, 2);
 
 5958                                          src2_stride, dst, dst_stride,
 
 5959                                          filter_x, filter_y, height, weight0,
 
 5960                                          weight1, offset0, offset1, rnd_val, 16);
 
 5966                                      const int16_t *src1_ptr,
 
 5970                                      const int8_t *filter_x,
 
 5971                                      const int8_t *filter_y,
 
 5980                                      src1_ptr, src2_stride,
 
 5982                                      filter_x, filter_y, height, weight0,
 
 5983                                      weight1, offset0, offset1, rnd_val, 24);
 
 5988                                      const int16_t *src1_ptr,
 
 5992                                      const int8_t *filter_x,
 
 5993                                      const int8_t *filter_y,
 
 6002                                      src1_ptr, src2_stride,
 
 6004                                      filter_x, filter_y, height, weight0,
 
 6005                                      weight1, offset0, offset1, rnd_val, 32);
 
 6008 #define BI_W_MC_COPY(WIDTH)                                                  \ 
 6009 void ff_hevc_put_hevc_bi_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst,           \ 
 6010                                                      ptrdiff_t dst_stride,   \ 
 6011                                                      const uint8_t *src,     \ 
 6012                                                      ptrdiff_t src_stride,   \ 
 6013                                                      const int16_t *src_16bit, \ 
 6024     int shift = 14 + 1 - 8;                                                  \ 
 6025     int log2Wd = denom + shift - 1;                                          \ 
 6027     hevc_biwgt_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE,  \ 
 6028                                    dst, dst_stride, height,                  \ 
 6029                                    weight0, weight1, offset0,                \ 
 6045 #define BI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                         \ 
 6046 void ff_hevc_put_hevc_bi_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,         \ 
 6049                                                         const uint8_t *src,   \ 
 6052                                                         const int16_t *src_16bit, \ 
 6063     const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR];                 \ 
 6064     int log2Wd = denom + 14 - 8;                                              \ 
 6066     hevc_##DIR1##_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,   \ 
 6067                                                 MAX_PB_SIZE, dst, dst_stride, \ 
 6068                                                 filter, height, weight0,      \ 
 6069                                                 weight1, offset0, offset1,    \ 
 6109 #define BI_W_MC_HV(PEL, WIDTH, TAP)                                         \ 
 6110 void ff_hevc_put_hevc_bi_w_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,          \ 
 6111                                                      ptrdiff_t dst_stride,  \ 
 6112                                                      const uint8_t *src,    \ 
 6113                                                      ptrdiff_t src_stride,  \ 
 6114                                                      const int16_t *src_16bit, \ 
 6125     const int8_t *filter_x = ff_hevc_##PEL##_filters[mx];                   \ 
 6126     const int8_t *filter_y = ff_hevc_##PEL##_filters[my];                   \ 
 6127     int log2Wd = denom + 14 - 8;                                            \ 
 6129     hevc_hv_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,       \ 
 6130                                           MAX_PB_SIZE, dst, dst_stride,     \ 
 6131                                           filter_x, filter_y, height,       \ 
 6132                                           weight0, weight1, offset0,        \