    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
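/* The rows above appear to be the tail of a VSHF.B shuffle-mask table: the
 * first row drives the 8-pixel-wide filter paths, the last two the 4-wide
 * paths, where indices >= 16 select bytes from the second source vector. */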
 
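/* 8-tap filter core: two dot-product/accumulate pairs over byte vectors,
 * combined into a single v8i16 result with a saturating add. */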
#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,             \
                            filt0, filt1, filt2, filt3)         \
( {                                                             \
    v8i16 tmp0, tmp1;                                           \
                                                                \
    tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0);         \
    tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1);  \
    tmp1 = __msa_dotp_s_h((v16i8) vec2, (v16i8) filt2);         \
    tmp1 = __msa_dpadd_s_h(tmp1, (v16i8) vec3, (v16i8) filt3);  \
    tmp0 = __msa_adds_s_h(tmp0, tmp1);                          \
                                                                \
    tmp0;                                                       \
} )
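/* Horizontal 8-tap filtering of one row: shuffle the source bytes through the
 * four masks, run the dot-product core, then round (srari by 7) and saturate
 * the halfwords down to 8 significant bits. */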
#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3,          \
                        filt_h0, filt_h1, filt_h2, filt_h3)              \
( {                                                                      \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                \
    v8i16 hz_out_m;                                                      \
                                                                         \
    VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,                   \
               vec0_m, vec1_m, vec2_m, vec3_m);                          \
    hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m,       \
                                   filt_h0, filt_h1, filt_h2, filt_h3);  \
                                                                         \
    hz_out_m = __msa_srari_h(hz_out_m, 7);                               \
    hz_out_m = __msa_sat_s_h(hz_out_m, 7);                               \
                                                                         \
    hz_out_m;                                                            \
} )
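/* 4-wide variant: four rows are filtered at once, with rows paired per
 * shuffle so that the two v8i16 outputs each hold two rows of results. */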
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                   \
                                   mask0, mask1, mask2, mask3,               \
                                   filt0, filt1, filt2, filt3,               \
                                   out0, out1)                               \
{                                                                            \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
    v8i16 res0_m, res1_m, res2_m, res3_m;                                    \
                                                                             \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);        \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m);               \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);        \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m);              \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);        \
    DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m);               \
    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);        \
    DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m);              \
    ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1);                 \
}
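/* 8-wide variant: each row is shuffled against itself; taps 0/2 start the dot
 * products, taps 1/3 accumulate, and the partial sums are combined with
 * saturating adds into out0..out3. */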
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
                                   mask0, mask1, mask2, mask3,                \
                                   filt0, filt1, filt2, filt3,                \
                                   out0, out1, out2, out3)                    \
{                                                                             \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;     \
    v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m;     \
                                                                              \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \
                res0_m, res1_m, res2_m, res3_m);                              \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);         \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,   \
                res4_m, res5_m, res6_m, res7_m);                              \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);         \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,  \
                 res0_m, res1_m, res2_m, res3_m);                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);         \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,  \
                 res4_m, res5_m, res6_m, res7_m);                             \
    ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m,       \
                res7_m, out0, out1, out2, out3);                              \
}
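/* Store helpers: pack the filtered halfwords to bytes (the XORI-128 variant
 * flips them back to unsigned), average with the pixels already at the
 * destination, and store. */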
#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst)  \
{                                                     \
    v16u8 tmp_m;                                      \
                                                      \
    tmp_m = PCKEV_XORI128_UB(in1, in0);               \
    tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst);       \
    ST_UB(tmp_m, (pdst));                             \
}
#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)                  \
{                                                             \
    v16u8 tmp_m;                                              \
                                                              \
    tmp_m = (v16u8) __msa_pckev_b((v16i8) in0, (v16i8) in1);  \
    tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst);               \
    ST_UB(tmp_m, (pdst));                                     \
}
#define PCKEV_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1,    \
                           pdst, stride)                      \
{                                                             \
    v16u8 tmp0_m, tmp1_m;                                     \
    uint8_t *pdst_m = (uint8_t *) (pdst);                     \
                                                              \
    PCKEV_B2_UB(in1, in0, in3, in2, tmp0_m, tmp1_m);          \
    AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m);  \
    ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, pdst_m, stride);        \
}
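/* Horizontal 8-tap filter paths for 4-, 8-, 16-, 32- and 64-pixel widths:
 * load rows, filter them with the HORIZ_8TAP_*_FILT helpers, pack to bytes
 * and store. */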
                                 uint8_t *dst, int32_t dst_stride,
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1;
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);

                                 uint8_t *dst, int32_t dst_stride,
    v16i8 filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;
    src += (4 * src_stride);
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);

                                uint8_t *dst, int32_t dst_stride,

                                 uint8_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
    v8i16 filt, out0, out1, out2, out3;
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
    ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);

                                     uint8_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
    v8i16 filt, out0, out1, out2, out3;
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src += (4 * src_stride);
                                   mask3, filt0, filt1, filt2, filt3, out0,
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

                                uint8_t *dst, int32_t dst_stride,

                                 uint8_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;
    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src += (2 * src_stride);
                                   mask3, filt0, filt1, filt2, filt3, out0,

                                 uint8_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;
    for (loop_cnt = (height >> 1); loop_cnt--;) {
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   mask3, filt0, filt1, filt2, filt3, out0,

                                 uint8_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;
    for (loop_cnt = height; loop_cnt--;) {
                                   mask2, mask3, filt0, filt1, filt2, filt3,
                                   out0, out1, out2, out3);
                                   mask2, mask3, filt0, filt1, filt2, filt3,
                                   out0, out1, out2, out3);
 
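/* Vertical 8-tap filter paths: rows are interleaved pairwise (ILVR/ILVL) so
 * each dot product spans eight vertically adjacent pixels, then the results
 * are saturated, packed to bytes and stored. */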
                                uint8_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
    v16i8 src10998, filt0, filt1, filt2, filt3;
    v8i16 filt, out10, out32;
    src -= (3 * src_stride);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
                                    filt1, filt2, filt3);
                                    filt1, filt2, filt3);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

                                uint8_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r;
    src -= (3 * src_stride);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
                                     filt1, filt2, filt3);
                                     filt1, filt2, filt3);
                                     filt1, filt2, filt3);
                                     filt1, filt2, filt3);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

                                 uint8_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
    src -= (3 * src_stride);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                   src87_l, src98_l, src109_l);
                                     filt1, filt2, filt3);
                                     filt1, filt2, filt3);
                                     filt1, filt2, filt3);
                                     filt1, filt2, filt3);
                                     filt1, filt2, filt3);
                                     filt1, filt2, filt3);
                                     filt1, filt2, filt3);
                                     filt1, filt2, filt3);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
        dst += (4 * dst_stride);

                                      uint8_t *dst, int32_t dst_stride,
    const uint8_t *src_tmp;
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
    src -= (3 * src_stride);
    for (cnt = (width >> 4); cnt--;) {
        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
                   src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
                   src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
            src_tmp += (4 * src_stride);
            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                       src87_r, src98_r, src109_r);
            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                       src87_l, src98_l, src109_l);
                                         filt0, filt1, filt2, filt3);
                                         filt0, filt1, filt2, filt3);
                                         filt0, filt1, filt2, filt3);
                                         filt0, filt1, filt2, filt3);
                                         filt0, filt1, filt2, filt3);
                                         filt0, filt1, filt2, filt3);
                                         filt0, filt1, filt2, filt3);
                                         filt0, filt1, filt2, filt3);
            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                        out3_r, tmp0, tmp1, tmp2, tmp3);
            ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

                                 uint8_t *dst, int32_t dst_stride,

                                 uint8_t *dst, int32_t dst_stride,
 
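/* Combined horizontal + vertical 8-tap paths: rows are first filtered
 * horizontally into hz_out* halfwords, which are then interleaved and run
 * through the vertical taps (filt_vt0..filt_vt3). */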
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
    src -= (3 + 3 * src_stride);
    src += (7 * src_stride);
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    SLDI_B2_SH(hz_out2, hz_out0, hz_out4, hz_out2, 8, hz_out1, hz_out3);
    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
    out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        hz_out8 = (v8i16) __msa_sldi_b((v16i8) hz_out9, (v16i8) hz_out7, 8);
        out4 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
    v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
    src -= (3 + 3 * src_stride);
    src += (7 * src_stride);
                              filt_hz1, filt_hz2, filt_hz3);
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
    ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
    ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        out8 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
                                   filt_vt1, filt_vt2, filt_vt3);
                                   filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        out9 = (v8i16) __msa_ilvev_b((v16i8) hz_out10, (v16i8) hz_out9);
        ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
    for (multiple8_cnt = 2; multiple8_cnt--;) {

                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
    for (multiple8_cnt = 4; multiple8_cnt--;) {

                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
    for (multiple8_cnt = 8; multiple8_cnt--;) {
 
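/* Averaging variants of the horizontal 8-tap paths: the filtered result is
 * averaged (aver_u_b) with the pixels already present at dst before the
 * store. */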
                                              uint8_t *dst, int32_t dst_stride,
    uint32_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3;
    v8i16 filt, res0, res1;
                               mask3, filt0, filt1, filt2, filt3, res0, res1);
    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
    res = (v16u8) __msa_aver_u_b(res, dst0);
    ST_W4(res, 0, 1, 2, 3, dst, dst_stride);

                                              uint8_t *dst, int32_t dst_stride,
    uint32_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
    v8i16 filt, vec0, vec1, vec2, vec3;
    src += (4 * src_stride);
    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
    LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
                               mask3, filt0, filt1, filt2, filt3, vec0, vec1);
                               mask3, filt0, filt1, filt2, filt3, vec2, vec3);
    PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
                res0, res1, res2, res3);
    ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
    ST_W8(res0, res2, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);

                                             uint8_t *dst, int32_t dst_stride,
    } else if (8 == height) {

                                             uint8_t *dst, int32_t dst_stride,
    int64_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
    v8i16 filt, out0, out1, out2, out3;
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src += (4 * src_stride);
                                   mask3, filt0, filt1, filt2, filt3, out0,
        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
        dst += (4 * dst_stride);

                                              uint8_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
    v8i16 filt, out0, out1, out2, out3;
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    for (loop_cnt = height >> 1; loop_cnt--;) {
        src += (2 * src_stride);
        VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
        VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
        DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
                    vec9, vec10, vec11);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
                     vec8, vec9, vec10, vec11);
        ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
        LD_UB2(dst, dst_stride, dst0, dst1);

                                              uint8_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
    v8i16 filt, out0, out1, out2, out3;
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    for (loop_cnt = height; loop_cnt--;) {
        src1 = __msa_sldi_b(src2, src0, 8);
        VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
        VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
        DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
                    vec9, vec10, vec11);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
                     vec8, vec9, vec10, vec11);
        ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
        LD_UB2(dst, 16, dst1, dst2);

                                              uint8_t *dst, int32_t dst_stride,
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
    v8i16 filt, out0, out1, out2, out3;
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    for (loop_cnt = height; loop_cnt--;) {
        for (cnt = 0; cnt < 2; ++cnt) {
            src2 = LD_SB(&src[16 + (cnt << 5)]);
            src3 = LD_SB(&src[24 + (cnt << 5)]);
            src1 = __msa_sldi_b(src2, src0, 8);
            VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6,
            VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7,
            DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                        vec0, vec1, vec2, vec3);
            DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2,
                        vec8, vec9, vec10, vec11);
            DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
                         vec0, vec1, vec2, vec3);
            DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
                         vec8, vec9, vec10, vec11);
            ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
            LD_UB2(&dst[cnt << 5], 16, dst1, dst2);
 
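/* Averaging variants of the vertical 8-tap paths: same vertical filtering as
 * above, with the destination rows loaded and averaged in before the store. */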
                                             uint8_t *dst, int32_t dst_stride,
    uint32_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
    v16i8 src10998, filt0, filt1, filt2, filt3;
    v8i16 filt, out10, out32;
    src -= (3 * src_stride);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
                                    filt1, filt2, filt3);
                                    filt1, filt2, filt3);
        out = __msa_aver_u_b(out, dst0);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

                                             uint8_t *dst, int32_t dst_stride,
    uint64_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;
    src -= (3 * src_stride);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
                                   filt1, filt2, filt3);
                                   filt1, filt2, filt3);
                                   filt1, filt2, filt3);
                                   filt1, filt2, filt3);
        dst += (4 * dst_stride);

    const uint8_t *src_tmp;
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16i8 filt0, filt1, filt2, filt3;
    v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
    src -= (3 * src_stride);
    for (cnt = (width >> 4); cnt--;) {
        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
                   src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
                   src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
            src_tmp += (4 * src_stride);
            LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3);
            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                       src87_r, src98_r, src109_r);
            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                       src87_l, src98_l, src109_l);
                                         filt0, filt1, filt2, filt3);
                                         filt0, filt1, filt2, filt3);
                                         filt0, filt1, filt2, filt3);
                                         filt0, filt1, filt2, filt3);
                                         filt0, filt1, filt2, filt3);
                                         filt0, filt1, filt2, filt3);
                                         filt0, filt1, filt2, filt3);
                                         filt0, filt1, filt2, filt3);
            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                        out3_r, tmp0, tmp1, tmp2, tmp3);
            AVER_UB4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
                        dst0, dst1, dst2, dst3);
            ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

                                              uint8_t *dst, int32_t dst_stride,

                                              uint8_t *dst, int32_t dst_stride,

                                              uint8_t *dst, int32_t dst_stride,
 
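/* Averaging variants of the combined horizontal + vertical 8-tap paths. */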
                                                  const int8_t *filter_horiz,
                                                  const int8_t *filter_vert,
    uint32_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16u8 dst0, res, mask0, mask1, mask2, mask3;
    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
    src -= (3 + 3 * src_stride);
    src += (7 * src_stride);
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    SLDI_B2_SH(hz_out2, hz_out0, hz_out4, hz_out2, 8, hz_out1, hz_out3);
    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
        vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
                                   filt_vt2, filt_vt3);
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        hz_out8 = (v8i16) __msa_sldi_b((v16i8) hz_out9, (v16i8) hz_out7, 8);
        vec4 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
                                   filt_vt2, filt_vt3);
        res = (v16u8) __msa_aver_u_b(res, dst0);
        ST_W4(res, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

                                                  const int8_t *filter_horiz,
                                                  const int8_t *filter_vert,
    uint64_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
    v16u8 dst0, dst1, mask0, mask1, mask2, mask3;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
    v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
    src -= (3 + 3 * src_stride);
    src += (7 * src_stride);
                              filt_hz1, filt_hz2, filt_hz3);
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
    ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
    ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
                                   filt_vt2, filt_vt3);
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
                                   filt_vt2, filt_vt3);
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        out8 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
                                   filt_vt2, filt_vt3);
                                   filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        out9 = (v8i16) __msa_ilvev_b((v16i8) hz_out10, (v16i8) hz_out9);
                                   filt_vt2, filt_vt3);
        dst += (4 * dst_stride);

                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert,
    for (multiple8_cnt = 2; multiple8_cnt--;) {
                                              filter_horiz, filter_vert,

                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert,
    for (multiple8_cnt = 4; multiple8_cnt--;) {
                                              filter_horiz, filter_vert,

                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert,
    for (multiple8_cnt = 8; multiple8_cnt--;) {
                                              filter_horiz, filter_vert,
 
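/* Bilinear (2-tap) horizontal paths: a single filter tap is splatted with
 * splati_h and applied through unsigned byte dot products over horizontally
 * adjacent pixel pairs. */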
                                 uint8_t *dst, int32_t dst_stride,
    v16u8 filt0, vec0, vec1, res0, res1;
    v8u16 vec2, vec3, filt;
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
    ST_W2(res0, 0, 1, dst, dst_stride);
    ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);

                                 uint8_t *dst, int32_t dst_stride,
    v16u8 vec0, vec1, vec2, vec3, filt0;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
    v16i8 res0, res1, res2, res3;
    v8u16 vec4, vec5, vec6, vec7, filt;
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                vec4, vec5, vec6, vec7);
    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
                res0, res1, res2, res3);
    ST_W2(res0, 0, 1, dst, dst_stride);
    ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
    ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
    ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);

                         const uint8_t *src, ptrdiff_t src_stride,
                         int height, int mx, int my)
    } else if (8 == height) {

                                 uint8_t *dst, int32_t dst_stride,
    v8u16 vec0, vec1, vec2, vec3, filt;
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                vec0, vec1, vec2, vec3);

                                     uint8_t *dst, int32_t dst_stride,
    v8u16 vec0, vec1, vec2, vec3, filt;
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
    src += (4 * src_stride);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                vec0, vec1, vec2, vec3);
    src += (4 * src_stride);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                vec0, vec1, vec2, vec3);
    ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
    dst += (8 * dst_stride);
        src += (4 * src_stride);
        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    vec0, vec1, vec2, vec3);
        src += (4 * src_stride);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    vec0, vec1, vec2, vec3);
        ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);

                         const uint8_t *src, ptrdiff_t src_stride,
                         int height, int mx, int my)

                          const uint8_t *src, ptrdiff_t src_stride,
                          int height, int mx, int my)
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
    v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
    loop_cnt = (height >> 2) - 1;
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
    src += (4 * src_stride);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                out0, out1, out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                out4, out5, out6, out7);
    for (; loop_cnt--;) {
        src += (4 * src_stride);
        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                    out4, out5, out6, out7);

                          const uint8_t *src, ptrdiff_t src_stride,
                          int height, int mx, int my)
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
    v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
    for (loop_cnt = height >> 1; loop_cnt--;) {
        src1 = __msa_sldi_b(src2, src0, 8);
        src5 = __msa_sldi_b(src6, src4, 8);
        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                    out4, out5, out6, out7);

                          const uint8_t *src, ptrdiff_t src_stride,
                          int height, int mx, int my)
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
    v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
    for (loop_cnt = height; loop_cnt--;) {
        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                    out4, out5, out6, out7);
 
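/* Bilinear (2-tap) vertical paths: vertically adjacent rows are interleaved
 * and reduced with a single splatted tap via unsigned dot products. */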
                                 uint8_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
    filt0 = (v16u8) __msa_splati_h(filt, 0);
    src += (5 * src_stride);
               src10_r, src21_r, src32_r, src43_r);
    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
    src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    ST_W4(src2110, 0, 1, 2, 3, dst, dst_stride);

                                 uint8_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
    v8u16 tmp0, tmp1, tmp2, tmp3;
    filt0 = (v16u8) __msa_splati_h(filt, 0);
    src += (8 * src_stride);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
    ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src87_r, src76_r, src2110, src4332, src6554, src8776);
    DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
                tmp0, tmp1, tmp2, tmp3);
    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
    ST_W8(src2110, src4332, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);

                         const uint8_t *src, ptrdiff_t src_stride,
                         int height, int mx, int my)
    } else if (8 == height) {

                                 uint8_t *dst, int32_t dst_stride,
    v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
    v8u16 tmp0, tmp1, tmp2, tmp3;
    filt0 = (v16u8) __msa_splati_h(filt, 0);
    ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                tmp0, tmp1, tmp2, tmp3);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);

                                     uint8_t *dst, int32_t dst_stride,
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
    v8u16 tmp0, tmp1, tmp2, tmp3;
    filt0 = (v16u8) __msa_splati_h(filt, 0);
    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
        src += (8 * src_stride);
                   vec0, vec1, vec2, vec3);
        ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
                   vec4, vec5, vec6, vec7);
        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    tmp0, tmp1, tmp2, tmp3);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                    tmp0, tmp1, tmp2, tmp3);
        ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
        dst += (8 * dst_stride);

                         const uint8_t *src, ptrdiff_t src_stride,
                         int height, int mx, int my)

                          const uint8_t *src, ptrdiff_t src_stride,
                          int height, int mx, int my)
    v16u8 src0, src1, src2, src3, src4;
    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
    v8u16 tmp0, tmp1, tmp2, tmp3;
    filt0 = (v16u8) __msa_splati_h(filt, 0);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src += (4 * src_stride);
        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);

                          const uint8_t *src, ptrdiff_t src_stride,
                          int height, int mx, int my)
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
    v8u16 tmp0, tmp1, tmp2, tmp3;
    filt0 = (v16u8) __msa_splati_h(filt, 0);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
        src += (4 * src_stride);
        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
        ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
        ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
        ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
        ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
        PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);
        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
        PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
        dst += (4 * dst_stride);

                          const uint8_t *src, ptrdiff_t src_stride,
                          int height, int mx, int my)
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
    v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    filt0 = (v16u8) __msa_splati_h(filt, 0);
    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_UB2(src + 16, src_stride, src4, src5);
        LD_UB2(src + 32, src_stride, src7, src8);
        LD_UB2(src + 48, src_stride, src10, src11);
        src += (2 * src_stride);
        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
        ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
        ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
        ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
        ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
        ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
        ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
        dst += (2 * dst_stride);
 
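/* Bilinear (2-tap) horizontal + vertical paths: a horizontal 2-tap pass
 * produces hz_out* values that are then interleaved and filtered vertically
 * with the splatted filt_vt tap. */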
 2487                                uint8_t *dst, 
int32_t dst_stride,
 
 2488                                const int8_t *filter_horiz, 
const int8_t *filter_vert)
 
 2491     v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
 
 2492     v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, 
filt, tmp0, tmp1;
 
 2498     filt_hz = (v16u8) __msa_splati_h((v8i16) 
filt, 0);
 
 2501     filt_vt = (v16u8) __msa_splati_h((v8i16) 
filt, 0);
 
 2507     hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
 
 2508     hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
 
 2510     ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
 
 2511     DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
 
 2515     ST_W2(res0, 0, 1, dst, dst_stride);
 
 2516     ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
 
 2520                                uint8_t *dst, 
int32_t dst_stride,
 
 2521                                const int8_t *filter_horiz, 
const int8_t *filter_vert)
 
 2523     v16i8 
src0, 
src1, src2, src3, src4, src5, src6, src7, src8, 
mask;
 
 2524     v16i8 res0, res1, res2, res3;
 
 2525     v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
 
 2526     v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
 
 2527     v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;

 2533     filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);

 2536     filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
 2539     src += (8 * src_stride);
 
 2547     SLDI_B3_UH(hz_out2, hz_out0, hz_out4, hz_out2, hz_out6, hz_out4, 8, hz_out1,
 
 2549     hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
 
 2551     ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
 
 2552     ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
 
 2553     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
 
 2554                 vec4, vec5, vec6, vec7);
 
 2557     PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
 
 2558                 res0, res1, res2, res3);
 
 2559     ST_W2(res0, 0, 1, dst, dst_stride);
 
 2560     ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
 
 2561     ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
 
 2562     ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
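/*
 * The 4-wide hv put dispatcher below appears to select between the two
 * fixed-size routines above: the 4x4 variant for height 4 and the 4x8
 * variant for height 8 (see the "8 == height" branch).
 */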
 
 2566                           const uint8_t *src, ptrdiff_t src_stride,

 2567                           int height, int mx, int my)
 
 2574                                   filter_horiz, filter_vert);
 
 2575     } else if (8 == height) {
 
 2577                                   filter_horiz, filter_vert);
 
 2582                                uint8_t *dst, int32_t dst_stride,

 2583                                const int8_t *filter_horiz, const int8_t *filter_vert)
 
 2586     v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
 
 2587     v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
 
 2594     filt_hz = (v16u8) __msa_splati_h(filt, 0);

 2597     filt_vt = (v16u8) __msa_splati_h(filt, 0);
 
 2603     vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
 
 2604     tmp0 = __msa_dotp_u_h(vec0, filt_vt);
 
 2607     vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
 
 2608     tmp1 = __msa_dotp_u_h(vec1, filt_vt);
 
 2611     vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
 
 2612     tmp2 = __msa_dotp_u_h(vec2, filt_vt);
 
 2615     vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
 
 2616     tmp3 = __msa_dotp_u_h(vec3, filt_vt);
 
 2621     ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
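/*
 * 8-wide hv variant that loops over the height (height >> 3 iterations,
 * eight output rows per pass), alternating the roles of hz_out0 and hz_out1
 * between successive rows.
 */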
 
 2625                                    uint8_t *dst, int32_t dst_stride,

 2626                                    const int8_t *filter_horiz, const int8_t *filter_vert,
 
 2631     v16u8 filt_hz, filt_vt, vec0;
 
 2632     v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
 
 2639     filt_hz = (v16u8) __msa_splati_h(filt, 0);

 2642     filt_vt = (v16u8) __msa_splati_h(filt, 0);

 2649     for (loop_cnt = (height >> 3); loop_cnt--;) {
 
 2651         src += (4 * src_stride);
 
 2654         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
 
 2655         tmp1 = __msa_dotp_u_h(vec0, filt_vt);
 
 2658         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
 
 2659         tmp2 = __msa_dotp_u_h(vec0, filt_vt);
 
 2665         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
 
 2666         tmp3 = __msa_dotp_u_h(vec0, filt_vt);
 
 2670         src += (4 * src_stride);
 
 2671         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
 
 2672         tmp4 = __msa_dotp_u_h(vec0, filt_vt);
 
 2677         ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
 
 2680         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
 
 2681         tmp5 = __msa_dotp_u_h(vec0, filt_vt);
 
 2684         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
 
 2685         tmp6 = __msa_dotp_u_h(vec0, filt_vt);
 
 2688         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
 
 2689         tmp7 = __msa_dotp_u_h(vec0, filt_vt);
 
 2692         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
 
 2693         tmp8 = __msa_dotp_u_h(vec0, filt_vt);
 
 2698         ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
 
 2699         dst += (8 * dst_stride);
 
 2704                           const uint8_t *src, ptrdiff_t src_stride,

 2705                           int height, int mx, int my)
 
 2712                                   filter_horiz, filter_vert);
 
 2715                                       filter_horiz, filter_vert, height);
 
 2720                            const uint8_t *src, ptrdiff_t src_stride,

 2721                            int height, int mx, int my)

 2726     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
 
 2727     v16u8 filt_hz, filt_vt, vec0, vec1;
 
 2728     v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
 
 2735     filt_hz = (v16u8) __msa_splati_h(filt, 0);

 2738     filt_vt = (v16u8) __msa_splati_h(filt, 0);

 2747     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 2750         src += (4 * src_stride);
 
 2754         ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
 
 2755         DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
 
 2763         ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
 
 2764         DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
 
 2772         ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
 
 2773         DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
 
 2781         ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
 
 2782         DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
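/*
 * The 32- and 64-wide hv put paths below appear to reuse the narrower routine,
 * sweeping it across the block width in multiple8_cnt passes (2 and 4
 * respectively).
 */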
 
 2791                            const uint8_t *src, ptrdiff_t src_stride,

 2792                            int height, int mx, int my)
 
 2796     for (multiple8_cnt = 2; multiple8_cnt--;) {
 
 2805                            const uint8_t *src, ptrdiff_t src_stride,

 2806                            int height, int mx, int my)
 
 2810     for (multiple8_cnt = 4; multiple8_cnt--;) {
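/*
 * From here on, the "*_and_aver_dst_*" variants: the same 2-tap filtering as
 * above, but the packed result is averaged with the existing destination
 * pixels (loaded via LW4/LD4/LD_UB*) before being stored, i.e. the "avg"
 * rather than the "put" behaviour.
 */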
 
 2820                                               uint8_t *dst, int32_t dst_stride,
 
 2823     uint32_t tp0, tp1, tp2, tp3;
 
 2825     v16u8 filt0, dst0, vec0, vec1, res;
 
 2826     v8u16 vec2, vec3, filt;

 2832     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
 2835     LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
 
 2838     DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
 
 2841     res = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);
 
 2842     res = (v16u8) __msa_aver_u_b(res, dst0);
 
 2844     ST_W4(res, 0, 1, 2, 3, dst, dst_stride);
 
 2849                                               uint8_t *dst, int32_t dst_stride,
 
 2852     uint32_t tp0, tp1, tp2, tp3;
 
 2853     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
 
 2854     v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
 
 2856     v8u16 vec4, vec5, vec6, vec7, filt;

 2862     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
 2865     LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
 
 2867     LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
 
 2871     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
 
 2874     PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
 
 2876     ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
 
 2878     ST_W8(res0, res2, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
 
 2882                          const uint8_t *src, ptrdiff_t src_stride,

 2883                          int height, int mx, int my)

 2890     } else if (8 == height) {
 
 2898                                               uint8_t *dst, int32_t dst_stride,
 
 2901     int64_t tp0, tp1, tp2, tp3;
 
 2903     v16u8 filt0, dst0, dst1;
 
 2904     v8u16 vec0, vec1, vec2, vec3, filt;

 2910     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
 2915     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
 
 2916                 vec0, vec1, vec2, vec3);
 
 2918     LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
 
 2931     int64_t tp0, tp1, tp2, tp3;
 
 2933     v16u8 filt0, dst0, dst1;
 
 2934     v8u16 vec0, vec1, vec2, vec3, filt;

 2940     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
 2943     src += (4 * src_stride);
 
 2946     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
 
 2949     LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
 
 2953     src += (4 * src_stride);
 
 2955     dst += (4 * dst_stride);
 
 2959     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
 
 2962     LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
 
 2966     dst += (4 * dst_stride);
 
 2970         src += (4 * src_stride);
 
 2974         DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
 
 2977         LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
 
 2982         dst += (4 * dst_stride);
 
 2986         DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
 
 2989         LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
 
 2997                          const uint8_t *src, ptrdiff_t src_stride,

 2998                          int height, int mx, int my)
 
 3012                           const uint8_t *src, ptrdiff_t src_stride,

 3013                           int height, int mx, int my)

 3017     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
 
 3018     v16u8 filt0, dst0, dst1, dst2, dst3;
 
 3019     v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 3020     v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;

 3026     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
 3030     src += (4 * src_stride);
 
 3036     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
 
 3038     DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
 
 3042     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 3052     for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
 
 3055         src += (4 * src_stride);
 
 3061         DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
 
 3063         DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4,
 
 3067         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 3080                           const uint8_t *src, ptrdiff_t src_stride,

 3081                           int height, int mx, int my)

 3085     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
 
 3086     v16u8 filt0, dst0, dst1, dst2, dst3;
 
 3087     v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 3088     v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;

 3094     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

 3096     for (loop_cnt = (height >> 1); loop_cnt--;) {
 
 3100         src1 = __msa_sldi_b(src2, src0, 8);
 
 3105         src5 = __msa_sldi_b(src6, src4, 8);
 
 3112         DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
 
 3113                     res0, res1, res2, res3);
 
 3114         DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
 
 3115                     res4, res5, res6, res7);
 
 3118         LD_UB2(dst, 16, dst0, dst1);
 
 3122         LD_UB2(dst, 16, dst2, dst3);
 
 3130                           const uint8_t *src, ptrdiff_t src_stride,

 3131                           int height, int mx, int my)

 3135     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
 
 3136     v16u8 filt0, dst0, dst1, dst2, dst3;
 
 3137     v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 3138     v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

 3144     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

 3146     for (loop_cnt = height; loop_cnt--;) {
 
 3156         DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
 
 3157                     out0, out1, out2, out3);
 
 3158         DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
 
 3159                     out4, out5, out6, out7);
 
 3162         LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
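/*
 * Vertical 2-tap variants with destination averaging follow: vertically
 * adjacent rows are interleaved (ILVR_B*), filtered with the splatted tap,
 * and the packed result is averaged against the loaded dst block before the
 * store.
 */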
 
 3173                                               uint8_t *dst, int32_t dst_stride,
 
 3176     uint32_t tp0, tp1, tp2, tp3;
 
 3177     v16i8 src0, src1, src2, src3, src4;

 3178     v16u8 dst0, out, filt0, src2110, src4332;
 
 3179     v16i8 src10_r, src32_r, src21_r, src43_r;
 
 3184     filt0 = (v16u8) __msa_splati_h(filt, 0);
 
 3187     src += (4 * src_stride);
 
 3192     LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
 
 3195                src10_r, src21_r, src32_r, src43_r);
 
 3196     ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
 
 3197     DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
 
 3201     out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
 
 3202     out = __msa_aver_u_b(out, dst0);

 3204     ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
 
 3209                                               uint8_t *dst, int32_t dst_stride,
 
 3212     uint32_t tp0, tp1, tp2, tp3;
 
 3214     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
 
 3215     v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
 
 3216     v16u8 src2110, src4332, src6554, src8776, filt0;
 
 3217     v8u16 tmp0, tmp1, tmp2, tmp3;
 
 3221     filt0 = (v16u8) __msa_splati_h(filt, 0);
 
 3224     src += (8 * src_stride);
 
 3227     LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
 
 3229     LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
 
 3231     ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
 
 3233     ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
 
 3235     ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
 
 3236                src87_r, src76_r, src2110, src4332, src6554, src8776);
 
 3237     DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
 
 3238                 tmp0, tmp1, tmp2, tmp3);
 
 3241     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
 
 3242     AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
 
 3243     ST_W8(src2110, src4332, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
 
 3247                          const uint8_t *src, ptrdiff_t src_stride,

 3248                          int height, int mx, int my)

 3255     } else if (8 == height) {
 
 3267     int64_t tp0, tp1, tp2, tp3;
 
 3268     v16u8 src0, src1, src2, src3, src4;
 
 3269     v16u8 dst0, dst1, vec0, vec1, vec2, vec3, filt0;
 
 3270     v8u16 tmp0, tmp1, tmp2, tmp3;
 
 3275     filt0 = (v16u8) __msa_splati_h(filt, 0);
 
 3278     LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
 
 3282     ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
 
 3283     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
 
 3284                 tmp0, tmp1, tmp2, tmp3);
 
 3298     int64_t tp0, tp1, tp2, tp3;
 
 3299     v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
 
 3300     v16u8 dst0, dst1, dst2, dst3;
 
 3301     v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
 
 3302     v8u16 tmp0, tmp1, tmp2, tmp3;
 
 3307     filt0 = (v16u8) __msa_splati_h(filt, 0);

 3312     for (loop_cnt = (height >> 3); loop_cnt--;) {

 3313         LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
 
 3314         src += (8 * src_stride);
 
 3316         LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
 
 3319         LD4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
 
 3324                    vec0, vec1, vec2, vec3);
 
 3325         ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
 
 3326                    vec4, vec5, vec6, vec7);
 
 3327         DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
 
 3328                     tmp0, tmp1, tmp2, tmp3);
 
 3332         dst += (4 * dst_stride);
 
 3334         DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
 
 3335                     tmp0, tmp1, tmp2, tmp3);
 
 3339         dst += (4 * dst_stride);
 
 3346                          const uint8_t *src, ptrdiff_t src_stride,

 3347                          int height, int mx, int my)
 
 3361                           const uint8_t *src, ptrdiff_t src_stride,

 3362                           int height, int mx, int my)

 3366     v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
 
 3367     v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 3368     v8u16 tmp0, tmp1, tmp2, tmp3, filt;

 3372     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

 3377     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 3379         src += (4 * src_stride);
 
 3381         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 3384         DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
 
 3390         ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
 
 3391         ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
 
 3392         DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
 
 3398         DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
 
 3404         DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
 
 3415                           const uint8_t *src, ptrdiff_t src_stride,

 3416                           int height, int mx, int my)

 3420     v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
 
 3421     v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
 3422     v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
 
 3423     v8u16 tmp0, tmp1, tmp2, tmp3, filt;

 3427     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

 3432     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 3434         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 3438         LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
 
 3439         LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7);
 
 3440         src += (4 * src_stride);
 
 3442         DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
 
 3447         DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
 
 3452         ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
 
 3453         ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
 
 3454         DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
 
 3459         DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
 
 3464         ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
 
 3465         ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
 
 3466         DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
 
 3471         DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
 
 3476         ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
 
 3477         ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
 
 3478         DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
 
 3483         DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
 
 3487         dst += (4 * dst_stride);
 
 3495                           const uint8_t *src, ptrdiff_t src_stride,

 3496                           int height, int mx, int my)

 3500     v16u8 src0, src1, src2, src3, src4, src5;
 
 3501     v16u8 src6, src7, src8, src9, src10, src11, filt0;
 
 3502     v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
 3503     v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 3504     v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 
 3509     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

 3514     for (loop_cnt = (height >> 1); loop_cnt--;) {
 
 3516         LD_UB2(dst, dst_stride, dst0, dst1);
 
 3517         LD_UB2(src + 16, src_stride, src4, src5);
 
 3518         LD_UB2(dst + 16, dst_stride, dst2, dst3);
 
 3519         LD_UB2(src + 32, src_stride, src7, src8);
 
 3520         LD_UB2(dst + 32, dst_stride, dst4, dst5);
 
 3521         LD_UB2(src + 48, src_stride, src10, src11);
 
 3522         LD_UB2(dst + 48, dst_stride, dst6, dst7);
 
 3523         src += (2 * src_stride);
 
 3527         DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
 
 3532         DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
 
 3537         ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
 
 3538         ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
 
 3539         DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
 
 3544         DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
 
 3549         ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
 
 3550         ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
 
 3551         DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
 
 3556         DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
 
 3561         ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
 
 3562         ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
 
 3563         DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
 
 3568         DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
 
 3572         dst += (2 * dst_stride);
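/*
 * Combined horizontal + vertical 2-tap filtering with destination averaging
 * follows: the horizontal pass fills the hz_out* rows, the vertical tap
 * filters pairs of them, and the result is averaged with dst before being
 * written back.
 */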
 
 3585                                                    const int8_t *filter_horiz,
 
 3586                                                    const int8_t *filter_vert)
 
 3588     uint32_t tp0, tp1, tp2, tp3;
 
 3590     v16u8 filt_hz, filt_vt, vec0, vec1;
 
 3592     v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;

 3598     filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);

 3601     filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
 3608     hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
 
 3609     hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
 
 3610     ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
 
 3612     LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
 
 3615     DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
 
 3619     out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
 
 3620     out = __msa_aver_u_b(out, dst0);

 3622     ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
 
 3629                                                    const int8_t *filter_horiz,
 
 3630                                                    const int8_t *filter_vert)
 
 3632     uint32_t tp0, tp1, tp2, tp3;
 
 3633     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
 
 3634     v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1;
 
 3636     v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
 
 3637     v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
 
 3644     filt_hz = (v16u8) __msa_splati_h(filt, 0);

 3647     filt_vt = (v16u8) __msa_splati_h(filt, 0);
 
 3650     src += (8 * src_stride);
 
 3658     SLDI_B3_UH(hz_out2, hz_out0, hz_out4, hz_out2, hz_out6, hz_out4, 8, hz_out1,
 
 3660     hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
 
 3662     LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
 
 3664     LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
 
 3666     ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
 
 3667     ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
 
 3668     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
 
 3669                 tmp0, tmp1, tmp2, tmp3);
 
 3674     ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
 
 3678                           const uint8_t *src, ptrdiff_t src_stride,

 3679                           int height, int mx, int my)
 
 3686                                                filter_horiz, filter_vert);
 
 3687     } else if (8 == height) {
 
 3689                                                filter_horiz, filter_vert);
 
 3697                                                    const int8_t *filter_horiz,
 
 3698                                                    const int8_t *filter_vert)
 
 3700     uint64_t tp0, tp1, tp2, tp3;
 
 3702     v16u8 filt_hz, filt_vt, dst0, dst1, vec0, vec1, vec2, vec3;
 
 3703     v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
 
 3710     filt_hz = (v16u8) __msa_splati_h(filt, 0);

 3713     filt_vt = (v16u8) __msa_splati_h(filt, 0);
 
 3716     src += (5 * src_stride);
 
 3718     LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
 
 3723     vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
 
 3724     tmp0 = __msa_dotp_u_h(vec0, filt_vt);
 
 3727     vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
 
 3728     tmp1 = __msa_dotp_u_h(vec1, filt_vt);
 
 3731     vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
 
 3732     tmp2 = __msa_dotp_u_h(vec2, filt_vt);
 
 3735     vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
 
 3736     tmp3 = __msa_dotp_u_h(vec3, filt_vt);
 
 3747                                                        const int8_t *filter_horiz,
 
 3748                                                        const int8_t *filter_vert,
 
 3752     uint64_t tp0, tp1, tp2, tp3;
 
 3754     v16u8 filt_hz, filt_vt, vec0, dst0, dst1;
 
 3755     v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
 
 3762     filt_hz = (v16u8) __msa_splati_h(filt, 0);

 3765     filt_vt = (v16u8) __msa_splati_h(filt, 0);

 3772     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 3774         src += (4 * src_stride);
 
 3777         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
 
 3778         tmp0 = __msa_dotp_u_h(vec0, filt_vt);
 
 3781         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
 
 3782         tmp1 = __msa_dotp_u_h(vec0, filt_vt);
 
 3788         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
 
 3789         tmp2 = __msa_dotp_u_h(vec0, filt_vt);
 
 3792         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
 
 3793         tmp3 = __msa_dotp_u_h(vec0, filt_vt);
 
 3797         LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
 
 3801         dst += (4 * dst_stride);
 
 3806                           const uint8_t *src, ptrdiff_t src_stride,

 3807                           int height, int mx, int my)
 
 3814                                                filter_horiz, filter_vert);
 
 3818                                                    filter_horiz, filter_vert,
 
 3824                            const uint8_t *src, ptrdiff_t src_stride,

 3825                            int height, int mx, int my)

 3830     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
 
 3831     v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
 
 3832     v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
 
 3839     filt_hz = (v16u8) __msa_splati_h(filt, 0);

 3842     filt_vt = (v16u8) __msa_splati_h(filt, 0);

 3850     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 3853         src += (4 * src_stride);
 
 3854         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 3858         ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
 
 3859         DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
 
 3867         ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
 
 3868         DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
 
 3876         ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
 
 3877         DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
 
 3885         ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
 
 3886         DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
 
 3895                            const uint8_t *src, ptrdiff_t src_stride,

 3896                            int height, int mx, int my)
 
 3900     for (multiple8_cnt = 2; multiple8_cnt--;) {
 
 3909                            const uint8_t *src, ptrdiff_t src_stride,

 3910                            int height, int mx, int my)
 
 3914     for (multiple8_cnt = 4; multiple8_cnt--;) {
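/*
 * Plain block-copy helpers follow (the copy_width*_msa routines used by the
 * ff_copy* wrappers generated at the end of the file): each copies a block of
 * the given width and height with the widest loads/stores available (LD4/SD4
 * for 8-wide, LD_UB*/ST_UB* for 16-wide and up).
 */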
 
 3923                             uint8_t *dst, int32_t dst_stride,
 
 3927     uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
 
 3930         for (cnt = height >> 3; cnt--;) {

 3931             LD4(src, src_stride, out0, out1, out2, out3);
 
 3932             src += (4 * src_stride);
 
 3933             LD4(src, src_stride, out4, out5, out6, out7);
 
 3934             src += (4 * src_stride);
 
 3936             SD4(out0, out1, out2, out3, dst, dst_stride);
 
 3937             dst += (4 * dst_stride);
 
 3938             SD4(out4, out5, out6, out7, dst, dst_stride);
 
 3939             dst += (4 * dst_stride);
 
 3941     } else if (0 == height % 4) {

 3942         for (cnt = (height / 4); cnt--;) {

 3943             LD4(src, src_stride, out0, out1, out2, out3);
 
 3944             src += (4 * src_stride);
 
 3946             SD4(out0, out1, out2, out3, dst, dst_stride);
 
 3947             dst += (4 * dst_stride);
 
 3953                              uint8_t *dst, int32_t dst_stride,

 3957     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

 3961         ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
 
 3962     } else if (16 == height) {
 
 3964         src += (8 * src_stride);
 
 3965         ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
 
 3966         dst += (8 * dst_stride);
 
 3968         src += (8 * src_stride);
 
 3969         ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
 
 3970         dst += (8 * dst_stride);
 
 3971     } else if (32 == height) {
 
 3973         src += (8 * src_stride);
 
 3974         ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
 
 3975         dst += (8 * dst_stride);
 
 3977         src += (8 * src_stride);
 
 3978         ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
 
 3979         dst += (8 * dst_stride);
 
 3981         src += (8 * src_stride);
 
 3982         ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
 
 3983         dst += (8 * dst_stride);
 
 3985         ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
 
 3986     } else if (0 == height % 4) {

 3987         for (cnt = (height >> 2); cnt--;) {
 
 3989             src += (4 * src_stride);
 
 3991             dst += (4 * dst_stride);
 
 3997                              uint8_t *dst, int32_t dst_stride,

 4001     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

 4004         for (cnt = (height >> 3); cnt--;) {
 
 4006             ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
 
 4009             src += (8 * src_stride);
 
 4012             dst += (8 * dst_stride);
 
 4014     } else if (0 == height % 4) {

 4015         for (cnt = (height >> 2); cnt--;) {

 4017             LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
 
 4018             src += (4 * src_stride);
 
 4020             ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
 
 4021             dst += (4 * dst_stride);
 
 4027                              uint8_t *dst, int32_t dst_stride,

 4031     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
 
 4032     v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
 
 4034     for (cnt = (height >> 2); cnt--;) {

 4037         LD_UB4(src, 16, src4, src5, src6, src7);

 4039         LD_UB4(src, 16, src8, src9, src10, src11);

 4041         LD_UB4(src, 16, src12, src13, src14, src15);
 
 4046         ST_UB4(src4, src5, src6, src7, dst, 16);
 
 4048         ST_UB4(src8, src9, src10, src11, dst, 16);
 
 4050         ST_UB4(src12, src13, src14, src15, dst, 16);
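/*
 * Averaging copy helpers follow (the avg_width*_msa routines used by the
 * ff_avg* wrappers generated at the end of the file): both the source and the
 * destination block are loaded and their rounded byte-wise average
 * (AVER_UB4_UB / __msa_aver_u_b) is stored back.
 */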
 
 4056                            uint8_t *dst, int32_t dst_stride,
 
 4059     uint32_t tp0, tp1, tp2, tp3;
 
 4060     v16u8 src0 = { 0 }, src1 = { 0 }, dst0 = { 0 }, dst1 = { 0 };

 4063         LW4(src, src_stride, tp0, tp1, tp2, tp3);
 
 4064         src += 4 * src_stride;
 
 4066         LW4(src, src_stride, tp0, tp1, tp2, tp3);
 
 4068         LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
 
 4070         LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
 
 4073         ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
 
 4074     } else if (4 == height) {

 4075         LW4(src, src_stride, tp0, tp1, tp2, tp3);
 
 4077         LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
 
 4079         dst0 = __msa_aver_u_b(src0, dst0);
 
 4080         ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
 
 4085                            uint8_t *dst, int32_t dst_stride,
 
 4089     uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
 
 4091     v16u8 dst0, dst1, dst2, dst3;
 
 4094         for (cnt = (height >> 3); cnt--;) {

 4095             LD4(src, src_stride, tp0, tp1, tp2, tp3);
 
 4096             src += 4 * src_stride;
 
 4097             LD4(src, src_stride, tp4, tp5, tp6, tp7);
 
 4098             src += 4 * src_stride;
 
 4103             LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
 
 4104             LD4(dst + 4 * dst_stride, dst_stride, tp4, tp5, tp6, tp7);
 
 4111             ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
 
 4112             dst += 8 * dst_stride;
 
 4114     } else if (4 == height) {

 4115         LD4(src, src_stride, tp0, tp1, tp2, tp3);
 
 4118         LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
 
 4122         ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
 
 4127                             uint8_t *dst, int32_t dst_stride,

 4131     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
 
 4132     v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
 4135         for (cnt = (height / 8); cnt--;) {
 
 4137             src += (8 * src_stride);
 
 4138             LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
 
 4141                         dst0, dst1, dst2, dst3);
 
 4142             AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
 
 4143                         dst4, dst5, dst6, dst7);
 
 4144             ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
 
 4145             dst += (8 * dst_stride);
 
 4147     } else if (0 == (height % 4)) {

 4148         for (cnt = (height / 4); cnt--;) {
 
 4150             src += (4 * src_stride);
 
 4151             LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 4154                         dst0, dst1, dst2, dst3);
 
 4155             ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
 
 4156             dst += (4 * dst_stride);
 
 4162                             uint8_t *dst, int32_t dst_stride,
 
 4166     uint8_t *dst_dup = dst;
 
 4167     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
 
 4168     v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
 
 4169     v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
 4170     v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
 
 4173         for (cnt = (height / 8); cnt--;) {
 
 4176             src += (4 * src_stride);
 
 4177             LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6);
 
 4178             LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7);
 
 4179             dst_dup += (4 * dst_stride);
 
 4180             LD_UB4(src, src_stride, src8, src10, src12, src14);

 4181             LD_UB4(src + 16, src_stride, src9, src11, src13, src15);
 
 4182             src += (4 * src_stride);
 
 4183             LD_UB4(dst_dup, dst_stride, dst8, dst10, dst12, dst14);
 
 4184             LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15);
 
 4185             dst_dup += (4 * dst_stride);
 
 4188                         dst0, dst1, dst2, dst3);
 
 4189             AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
 
 4190                         dst4, dst5, dst6, dst7);
 
 4191             AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
 
 4192                         dst8, dst9, dst10, dst11);
 
 4193             AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
 
 4194                         dst12, dst13, dst14, dst15);
 
 4196             ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
 
 4197             ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
 
 4198             dst += (4 * dst_stride);
 
 4199             ST_UB4(dst8, dst10, dst12, dst14, dst, dst_stride);
 
 4200             ST_UB4(dst9, dst11, dst13, dst15, dst + 16, dst_stride);
 
 4201             dst += (4 * dst_stride);
 
 4203     } else if (0 == (height % 4)) {

 4204         for (cnt = (height / 4); cnt--;) {
 
 4207             src += (4 * src_stride);
 
 4208             LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6);
 
 4209             LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7);
 
 4210             dst_dup += (4 * dst_stride);
 
 4213                         dst0, dst1, dst2, dst3);
 
 4214             AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
 
 4215                         dst4, dst5, dst6, dst7);
 
 4217             ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
 
 4218             ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
 
 4219             dst += (4 * dst_stride);
 
 4225                             uint8_t *dst, int32_t dst_stride,
 
 4229     uint8_t *dst_dup = dst;
 
 4230     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
 
 4231     v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
 
 4232     v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
 4233     v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
 
 4235     for (cnt = (height / 4); cnt--;) {

 4238         LD_UB4(src, 16, src4, src5, src6, src7);

 4240         LD_UB4(src, 16, src8, src9, src10, src11);

 4242         LD_UB4(src, 16, src12, src13, src14, src15);
 
 4245         LD_UB4(dst_dup, 16, dst0, dst1, dst2, dst3);
 
 4246         dst_dup += dst_stride;
 
 4247         LD_UB4(dst_dup, 16, dst4, dst5, dst6, dst7);
 
 4248         dst_dup += dst_stride;
 
 4249         LD_UB4(dst_dup, 16, dst8, dst9, dst10, dst11);
 
 4250         dst_dup += dst_stride;
 
 4251         LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15);
 
 4252         dst_dup += dst_stride;
 
 4255                     dst0, dst1, dst2, dst3);
 
 4256         AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
 
 4257                     dst4, dst5, dst6, dst7);
 
 4258         AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
 
 4259                     dst8, dst9, dst10, dst11);
 
 4260         AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
 
 4261                     dst12, dst13, dst14, dst15);
 
 4263         ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
 
 4265         ST_UB4(dst4, dst5, dst6, dst7, dst, 16);
 
 4267         ST_UB4(dst8, dst9, dst10, dst11, dst, 16);
 
 4269         ST_UB4(dst12, dst13, dst14, dst15, dst, 16);
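/*
 * 8-tap sub-pel filter coefficient tables (vp9_subpel_filters_msa): three
 * filter sets of 15 fractional positions with 8 taps each.  The wrapper
 * macros below index them as [type_idx][mx - 1] or [type_idx][my - 1]; the
 * three sets presumably correspond to VP9's regular, sharp and smooth
 * 8-tap filters.
 */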
 
 4276          {0, 1, -5, 126, 8, -3, 1, 0},
 
 4277          {-1, 3, -10, 122, 18, -6, 2, 0},
 
 4278          {-1, 4, -13, 118, 27, -9, 3, -1},
 
 4279          {-1, 4, -16, 112, 37, -11, 4, -1},
 
 4280          {-1, 5, -18, 105, 48, -14, 4, -1},
 
 4281          {-1, 5, -19, 97, 58, -16, 5, -1},
 
 4282          {-1, 6, -19, 88, 68, -18, 5, -1},
 
 4283          {-1, 6, -19, 78, 78, -19, 6, -1},
 
 4284          {-1, 5, -18, 68, 88, -19, 6, -1},
 
 4285          {-1, 5, -16, 58, 97, -19, 5, -1},
 
 4286          {-1, 4, -14, 48, 105, -18, 5, -1},
 
 4287          {-1, 4, -11, 37, 112, -16, 4, -1},
 
 4288          {-1, 3, -9, 27, 118, -13, 4, -1},
 
 4289          {0, 2, -6, 18, 122, -10, 3, -1},
 
 4290          {0, 1, -3, 8, 126, -5, 1, 0},
 
 4292         {-1, 3, -7, 127, 8, -3, 1, 0},
 
 4293         {-2, 5, -13, 125, 17, -6, 3, -1},
 
 4294         {-3, 7, -17, 121, 27, -10, 5, -2},
 
 4295         {-4, 9, -20, 115, 37, -13, 6, -2},
 
 4296         {-4, 10, -23, 108, 48, -16, 8, -3},
 
 4297         {-4, 10, -24, 100, 59, -19, 9, -3},
 
 4298         {-4, 11, -24, 90, 70, -21, 10, -4},
 
 4299         {-4, 11, -23, 80, 80, -23, 11, -4},
 
 4300         {-4, 10, -21, 70, 90, -24, 11, -4},
 
 4301         {-3, 9, -19, 59, 100, -24, 10, -4},
 
 4302         {-3, 8, -16, 48, 108, -23, 10, -4},
 
 4303         {-2, 6, -13, 37, 115, -20, 9, -4},
 
 4304         {-2, 5, -10, 27, 121, -17, 7, -3},
 
 4305         {-1, 3, -6, 17, 125, -13, 5, -2},
 
 4306         {0, 1, -3, 8, 127, -7, 3, -1},
 
 4308         {-3, -1, 32, 64, 38, 1, -3, 0},
 
 4309         {-2, -2, 29, 63, 41, 2, -3, 0},
 
 4310         {-2, -2, 26, 63, 43, 4, -4, 0},
 
 4311         {-2, -3, 24, 62, 46, 5, -4, 0},
 
 4312         {-2, -3, 21, 60, 49, 7, -4, 0},
 
 4313         {-1, -4, 18, 59, 51, 9, -4, 0},
 
 4314         {-1, -4, 16, 57, 53, 12, -4, -1},
 
 4315         {-1, -4, 14, 55, 55, 14, -4, -1},
 
 4316         {-1, -4, 12, 53, 57, 16, -4, -1},
 
 4317         {0, -4, 9, 51, 59, 18, -4, -1},
 
 4318         {0, -4, 7, 49, 60, 21, -3, -2},
 
 4319         {0, -4, 5, 46, 62, 24, -3, -2},
 
 4320         {0, -4, 4, 43, 63, 26, -2, -2},
 
 4321         {0, -3, 2, 41, 63, 29, -2, -2},
 
 4322         {0, -3, 1, 38, 64, 32, -1, -3},
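/*
 * VP9_8TAP_MIPS_MSA_FUNC(SIZE, type, type_idx) expands to the public
 * ff_put_8tap_<type>_<SIZE>{h,v,hv}_msa and ff_avg_8tap_<type>_<SIZE>{h,v,hv}_msa
 * entry points, which pick a coefficient row from the tables above via mx/my
 * and dispatch to the corresponding common_* helpers.  VP9_COPY_AVG_MIPS_MSA_FUNC
 * and VP9_AVG_MIPS_MSA_FUNC likewise wrap copy_width*_msa and avg_width*_msa.
 * A hypothetical instantiation (the real ones are not part of this excerpt)
 * would look like: VP9_8TAP_MIPS_MSA_FUNC(16, regular, 0);  // illustrative only
 */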
 
 4326 #define VP9_8TAP_MIPS_MSA_FUNC(SIZE, type, type_idx)                           \ 
 4327 void ff_put_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride,     \ 
 4328                                         const uint8_t *src,                    \ 
 4329                                         ptrdiff_t srcstride,                   \ 
 4330                                         int h, int mx, int my)                 \ 
 4332     const int8_t *filter = vp9_subpel_filters_msa[type_idx][mx-1];             \ 
 4334     common_hz_8t_##SIZE##w_msa(src, srcstride, dst, dststride, filter, h);     \ 
 4337 void ff_put_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride,     \ 
 4338                                         const uint8_t *src,                    \ 
 4339                                         ptrdiff_t srcstride,                   \ 
 4340                                         int h, int mx, int my)                 \ 
 4342     const int8_t *filter = vp9_subpel_filters_msa[type_idx][my-1];             \ 
 4344     common_vt_8t_##SIZE##w_msa(src, srcstride, dst, dststride, filter, h);     \ 
 4347 void ff_put_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride,    \ 
 4348                                          const uint8_t *src,                   \ 
 4349                                          ptrdiff_t srcstride,                  \ 
 4350                                          int h, int mx, int my)                \ 
 4352     const int8_t *hfilter = vp9_subpel_filters_msa[type_idx][mx-1];            \ 
 4353     const int8_t *vfilter = vp9_subpel_filters_msa[type_idx][my-1];            \ 
 4355     common_hv_8ht_8vt_##SIZE##w_msa(src, srcstride, dst, dststride, hfilter,   \ 
 4359 void ff_avg_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride,     \ 
 4360                                         const uint8_t *src,                    \ 
 4361                                         ptrdiff_t srcstride,                   \ 
 4362                                         int h, int mx, int my)                 \ 
 4364     const int8_t *filter = vp9_subpel_filters_msa[type_idx][mx-1];             \ 
 4366     common_hz_8t_and_aver_dst_##SIZE##w_msa(src, srcstride, dst,               \ 
 4367                                             dststride, filter, h);             \ 
 4370 void ff_avg_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride,     \ 
 4371                                         const uint8_t *src,                    \ 
 4372                                         ptrdiff_t srcstride,                   \ 
 4373                                         int h, int mx, int my)                 \ 
 4375     const int8_t *filter = vp9_subpel_filters_msa[type_idx][my-1];             \ 
 4377     common_vt_8t_and_aver_dst_##SIZE##w_msa(src, srcstride, dst, dststride,    \ 
 4381 void ff_avg_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride,    \ 
 4382                                          const uint8_t *src,                   \ 
 4383                                          ptrdiff_t srcstride,                  \ 
 4384                                          int h, int mx, int my)                \ 
 4386     const int8_t *hfilter = vp9_subpel_filters_msa[type_idx][mx-1];            \ 
 4387     const int8_t *vfilter = vp9_subpel_filters_msa[type_idx][my-1];            \ 
 4389     common_hv_8ht_8vt_and_aver_dst_##SIZE##w_msa(src, srcstride, dst,          \ 
 4390                                                  dststride, hfilter,           \ 
 4394 #define VP9_COPY_AVG_MIPS_MSA_FUNC(SIZE)                           \ 
 4395 void ff_copy##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride,        \ 
 4396                          const uint8_t *src, ptrdiff_t srcstride,  \ 
 4397                          int h, int mx, int my)                    \ 
 4400     copy_width##SIZE##_msa(src, srcstride, dst, dststride, h);     \ 
 4403 void ff_avg##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride,         \ 
 4404                         const uint8_t *src, ptrdiff_t srcstride,   \ 
 4405                         int h, int mx, int my)                     \ 
 4408     avg_width##SIZE##_msa(src, srcstride, dst, dststride, h);      \ 
 4411 #define VP9_AVG_MIPS_MSA_FUNC(SIZE)                               \ 
 4412 void ff_avg##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride,        \ 
 4413                         const uint8_t *src, ptrdiff_t srcstride,  \ 
 4414                         int h, int mx, int my)                    \ 
 4417     avg_width##SIZE##_msa(src, srcstride, dst, dststride, h);     \ 
 4444 #undef VP9_8TAP_MIPS_MSA_FUNC 
 4445 #undef VP9_COPY_AVG_MIPS_MSA_FUNC 
 4446 #undef VP9_AVG_MIPS_MSA_FUNC