/* Simple-filter mask: filter where 2 * |p0 - q0| + |p1 - q1| / 2 <= b_limit */
#define VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask)           \
{                                                                \
    v16u8 p1_a_sub_q1, p0_a_sub_q0;                              \
                                                                 \
    p0_a_sub_q0 = __msa_asub_u_b(p0, q0);                        \
    p1_a_sub_q1 = __msa_asub_u_b(p1, q1);                        \
    p1_a_sub_q1 = (v16u8) __msa_srli_b((v16i8) p1_a_sub_q1, 1);  \
    p0_a_sub_q0 = __msa_adds_u_b(p0_a_sub_q0, p0_a_sub_q0);      \
    mask = __msa_adds_u_b(p0_a_sub_q0, p1_a_sub_q1);             \
    mask = ((v16u8) mask <= b_limit);                            \
}
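
/* Four-tap filter for the inner edges. Pixels are XORed with 0x80 so
 * unsigned input can be processed with signed saturating arithmetic:
 * filt = clamp(p1 - q1) (kept only where hev) + 3 * (q0 - p0), widened
 * to 16 bits and saturated back to 8; then (filt + 4) >> 3 is
 * subtracted from q0 and (filt + 3) >> 3 added to p0. Where hev is
 * clear, p1/q1 are additionally nudged by the rounded half of filt1. */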
#define VP8_LPF_FILTER4_4W(p1_in_out, p0_in_out, q0_in_out, q1_in_out,  \
                           mask_in, hev_in)                             \
{                                                                       \
    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign;                 \
    v16i8 filt, filt1, filt2, cnst4b, cnst3b;                           \
    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h;             \
                                                                        \
    p1_m = (v16i8) __msa_xori_b(p1_in_out, 0x80);                       \
    p0_m = (v16i8) __msa_xori_b(p0_in_out, 0x80);                       \
    q0_m = (v16i8) __msa_xori_b(q0_in_out, 0x80);                       \
    q1_m = (v16i8) __msa_xori_b(q1_in_out, 0x80);                       \
                                                                        \
    filt = __msa_subs_s_b(p1_m, q1_m);                                  \
    filt = filt & (v16i8) hev_in;                                       \
    q0_sub_p0 = q0_m - p0_m;                                            \
    filt_sign = __msa_clti_s_b(filt, 0);                                \
                                                                        \
    cnst3h = __msa_ldi_h(3);                                            \
    q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0, q0_sub_p0);           \
    q0_sub_p0_r = __msa_dotp_s_h((v16i8) q0_sub_p0_r, (v16i8) cnst3h);  \
    filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt);                     \
    filt_r += q0_sub_p0_r;                                              \
    filt_r = __msa_sat_s_h(filt_r, 7);                                  \
                                                                        \
    q0_sub_p0_l = (v8i16) __msa_ilvl_b(q0_sub_p0, q0_sub_p0);           \
    q0_sub_p0_l = __msa_dotp_s_h((v16i8) q0_sub_p0_l, (v16i8) cnst3h);  \
    filt_l = (v8i16) __msa_ilvl_b(filt_sign, filt);                     \
    filt_l += q0_sub_p0_l;                                              \
    filt_l = __msa_sat_s_h(filt_l, 7);                                  \
                                                                        \
    filt = __msa_pckev_b((v16i8) filt_l, (v16i8) filt_r);               \
    filt = filt & (v16i8) mask_in;                                      \
                                                                        \
    cnst4b = __msa_ldi_b(4);                                            \
    filt1 = __msa_adds_s_b(filt, cnst4b);                               \
    filt1 >>= 3;                                                        \
                                                                        \
    cnst3b = __msa_ldi_b(3);                                            \
    filt2 = __msa_adds_s_b(filt, cnst3b);                               \
    filt2 >>= 3;                                                        \
                                                                        \
    q0_m = __msa_subs_s_b(q0_m, filt1);                                 \
    q0_in_out = __msa_xori_b((v16u8) q0_m, 0x80);                       \
    p0_m = __msa_adds_s_b(p0_m, filt2);                                 \
    p0_in_out = __msa_xori_b((v16u8) p0_m, 0x80);                       \
                                                                        \
    filt = __msa_srari_b(filt1, 1);                                     \
    hev_in = __msa_xori_b((v16u8) hev_in, 0xff);                        \
    filt = filt & (v16i8) hev_in;                                       \
                                                                        \
    q1_m = __msa_subs_s_b(q1_m, filt);                                  \
    q1_in_out = __msa_xori_b((v16u8) q1_m, 0x80);                       \
    p1_m = __msa_adds_s_b(p1_m, filt);                                  \
    p1_in_out = __msa_xori_b((v16u8) p1_m, 0x80);                       \
}
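
/* Simple filter: same (filt + 4) >> 3 / (filt + 3) >> 3 update as
 * VP8_LPF_FILTER4_4W, but without the hev split, and only p0/q0 are
 * written back. */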
#define VP8_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask)           \
{                                                                   \
    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, q0_sub_p0_sign;        \
    v16i8 filt, filt1, filt2, cnst4b, cnst3b, filt_sign;            \
    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h;         \
                                                                    \
    p1_m = (v16i8) __msa_xori_b(p1_in, 0x80);                       \
    p0_m = (v16i8) __msa_xori_b(p0_in, 0x80);                       \
    q0_m = (v16i8) __msa_xori_b(q0_in, 0x80);                       \
    q1_m = (v16i8) __msa_xori_b(q1_in, 0x80);                       \
                                                                    \
    filt = __msa_subs_s_b(p1_m, q1_m);                              \
    q0_sub_p0 = q0_m - p0_m;                                        \
    filt_sign = __msa_clti_s_b(filt, 0);                            \
                                                                    \
    cnst3h = __msa_ldi_h(3);                                        \
    q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0);                  \
    q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0);  \
    q0_sub_p0_r *= cnst3h;                                          \
    filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt);                 \
    filt_r += q0_sub_p0_r;                                          \
    filt_r = __msa_sat_s_h(filt_r, 7);                              \
                                                                    \
    q0_sub_p0_l = (v8i16) __msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0);  \
    q0_sub_p0_l *= cnst3h;                                          \
    filt_l = (v8i16) __msa_ilvl_b(filt_sign, filt);                 \
    filt_l += q0_sub_p0_l;                                          \
    filt_l = __msa_sat_s_h(filt_l, 7);                              \
                                                                    \
    filt = __msa_pckev_b((v16i8) filt_l, (v16i8) filt_r);           \
    filt = filt & (v16i8) (mask);                                   \
                                                                    \
    cnst4b = __msa_ldi_b(4);                                        \
    filt1 = __msa_adds_s_b(filt, cnst4b);                           \
    filt1 >>= 3;                                                    \
                                                                    \
    cnst3b = __msa_ldi_b(3);                                        \
    filt2 = __msa_adds_s_b(filt, cnst3b);                           \
    filt2 >>= 3;                                                    \
                                                                    \
    q0_m = __msa_subs_s_b(q0_m, filt1);                             \
    p0_m = __msa_adds_s_b(p0_m, filt2);                             \
    q0_in = __msa_xori_b((v16u8) q0_m, 0x80);                       \
    p0_in = __msa_xori_b((v16u8) p0_m, 0x80);                       \
}
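
/* Macroblock-edge filter. Where hev is set, p0/q0 get the standard
 * (filt + 4) >> 3 / (filt + 3) >> 3 update; elsewhere the three pixel
 * pairs are adjusted with the tapered weights of the VP8 spec:
 * u = (27 * w + 63) >> 7 for p0/q0, (18 * w + 63) >> 7 for p1/q1 and
 * (9 * w + 63) >> 7 for p2/q2, computed in 16-bit halves and packed
 * back to bytes. */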
#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev)             \
{                                                                   \
    v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m;                       \
    v16i8 filt, q0_sub_p0, cnst4b, cnst3b;                          \
    v16i8 u, filt1, filt2, filt_sign, q0_sub_p0_sign;               \
    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_r, u_r, u_l, filt_l;       \
    v8i16 cnst3h, cnst27h, cnst18h, cnst63h;                        \
                                                                    \
    cnst3h = __msa_ldi_h(3);                                        \
                                                                    \
    p2_m = (v16i8) __msa_xori_b(p2, 0x80);                          \
    p1_m = (v16i8) __msa_xori_b(p1, 0x80);                          \
    p0_m = (v16i8) __msa_xori_b(p0, 0x80);                          \
    q0_m = (v16i8) __msa_xori_b(q0, 0x80);                          \
    q1_m = (v16i8) __msa_xori_b(q1, 0x80);                          \
    q2_m = (v16i8) __msa_xori_b(q2, 0x80);                          \
                                                                    \
    filt = __msa_subs_s_b(p1_m, q1_m);                              \
    q0_sub_p0 = q0_m - p0_m;                                        \
    q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0);                  \
    filt_sign = __msa_clti_s_b(filt, 0);                            \
                                                                    \
    /* right (low) halves */                                        \
    q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0);  \
    q0_sub_p0_r *= cnst3h;                                          \
    filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt);                 \
    filt_r = filt_r + q0_sub_p0_r;                                  \
    filt_r = __msa_sat_s_h(filt_r, 7);                              \
                                                                    \
    /* left (high) halves */                                        \
    q0_sub_p0_l = (v8i16) __msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0);  \
    q0_sub_p0_l *= cnst3h;                                          \
    filt_l = (v8i16) __msa_ilvl_b(filt_sign, filt);                 \
    filt_l = filt_l + q0_sub_p0_l;                                  \
    filt_l = __msa_sat_s_h(filt_l, 7);                              \
                                                                    \
    filt = __msa_pckev_b((v16i8) filt_l, (v16i8) filt_r);           \
    filt = filt & (v16i8) mask;                                     \
    filt2 = filt & (v16i8) hev;                                     \
                                                                    \
    /* filt &= ~hev */                                              \
    hev = __msa_xori_b(hev, 0xff);                                  \
    filt = filt & (v16i8) hev;                                      \
    cnst4b = __msa_ldi_b(4);                                        \
    filt1 = __msa_adds_s_b(filt2, cnst4b);                          \
    filt1 >>= 3;                                                    \
                                                                    \
    cnst3b = __msa_ldi_b(3);                                        \
    filt2 = __msa_adds_s_b(filt2, cnst3b);                          \
    filt2 >>= 3;                                                    \
                                                                    \
    q0_m = __msa_subs_s_b(q0_m, filt1);                             \
    p0_m = __msa_adds_s_b(p0_m, filt2);                             \
                                                                    \
    filt_sign = __msa_clti_s_b(filt, 0);                            \
    ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l);                   \
                                                                    \
    cnst27h = __msa_ldi_h(27);                                      \
    cnst63h = __msa_ldi_h(63);                                      \
                                                                    \
    /* u = (27 * w + 63) >> 7, applied to p0/q0 */                  \
    u_r = filt_r * cnst27h;                                         \
    u_r += cnst63h;                                                 \
    u_r >>= 7;                                                      \
    u_r = __msa_sat_s_h(u_r, 7);                                    \
    u_l = filt_l * cnst27h;                                         \
    u_l += cnst63h;                                                 \
    u_l >>= 7;                                                      \
    u_l = __msa_sat_s_h(u_l, 7);                                    \
    u = __msa_pckev_b((v16i8) u_l, (v16i8) u_r);                    \
    q0_m = __msa_subs_s_b(q0_m, u);                                 \
    q0 = __msa_xori_b((v16u8) q0_m, 0x80);                          \
    p0_m = __msa_adds_s_b(p0_m, u);                                 \
    p0 = __msa_xori_b((v16u8) p0_m, 0x80);                          \
                                                                    \
    /* u = (18 * w + 63) >> 7, applied to p1/q1 */                  \
    cnst18h = __msa_ldi_h(18);                                      \
    u_r = filt_r * cnst18h;                                         \
    u_r += cnst63h;                                                 \
    u_r >>= 7;                                                      \
    u_r = __msa_sat_s_h(u_r, 7);                                    \
    u_l = filt_l * cnst18h;                                         \
    u_l += cnst63h;                                                 \
    u_l >>= 7;                                                      \
    u_l = __msa_sat_s_h(u_l, 7);                                    \
    u = __msa_pckev_b((v16i8) u_l, (v16i8) u_r);                    \
    q1_m = __msa_subs_s_b(q1_m, u);                                 \
    q1 = __msa_xori_b((v16u8) q1_m, 0x80);                          \
    p1_m = __msa_adds_s_b(p1_m, u);                                 \
    p1 = __msa_xori_b((v16u8) p1_m, 0x80);                          \
                                                                    \
    /* u = (9 * w + 63) >> 7 (9 * w = (w << 3) + w), for p2/q2 */   \
    u_r = filt_r << 3;                                              \
    u_r += filt_r + cnst63h;                                        \
    u_r >>= 7;                                                      \
    u_r = __msa_sat_s_h(u_r, 7);                                    \
    u_l = filt_l << 3;                                              \
    u_l += filt_l + cnst63h;                                        \
    u_l >>= 7;                                                      \
    u_l = __msa_sat_s_h(u_l, 7);                                    \
    u = __msa_pckev_b((v16i8) u_l, (v16i8) u_r);                    \
    q2_m = __msa_subs_s_b(q2_m, u);                                 \
    q2 = __msa_xori_b((v16u8) q2_m, 0x80);                          \
    p2_m = __msa_adds_s_b(p2_m, u);                                 \
    p2 = __msa_xori_b((v16u8) p2_m, 0x80);                          \
}
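
/* Builds the filter mask and the high-edge-variance flag from the
 * eight pixels around the edge: hev = max(|p1 - p0|, |q1 - q0|) >
 * thresh; the edge is filtered when 2 * |p0 - q0| + |p1 - q1| / 2 <=
 * b_limit and every neighbouring difference is <= limit. */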
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in,                   \
                     q0_in, q1_in, q2_in, q3_in,                   \
                     limit_in, b_limit_in, thresh_in,              \
                     hev_out, mask_out, flat_out)                  \
{                                                                  \
    v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;  \
    v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;  \
                                                                   \
    /* absolute differences between pixel values */                \
    p3_asub_p2_m = __msa_asub_u_b((p3_in), (p2_in));               \
    p2_asub_p1_m = __msa_asub_u_b((p2_in), (p1_in));               \
    p1_asub_p0_m = __msa_asub_u_b((p1_in), (p0_in));               \
    q1_asub_q0_m = __msa_asub_u_b((q1_in), (q0_in));               \
    q2_asub_q1_m = __msa_asub_u_b((q2_in), (q1_in));               \
    q3_asub_q2_m = __msa_asub_u_b((q3_in), (q2_in));               \
    p0_asub_q0_m = __msa_asub_u_b((p0_in), (q0_in));               \
    p1_asub_q1_m = __msa_asub_u_b((p1_in), (q1_in));               \
                                                                   \
    flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m);          \
    hev_out = (thresh_in) < (v16u8) flat_out;                      \
                                                                   \
    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m);     \
    p1_asub_q1_m >>= 1;                                            \
    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m);     \
    mask_out = (b_limit_in) < p0_asub_q0_m;                        \
    mask_out = __msa_max_u_b(flat_out, mask_out);                  \
    p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m);      \
    mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out);              \
    q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m);      \
    mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out);              \
    mask_out = (limit_in) < (v16u8) mask_out;                      \
    mask_out = __msa_xori_b(mask_out, 0xff);                       \
}
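
/* Stores 6 bytes of a filtered row: a word (4 bytes) extracted from
 * in0, then a halfword (2 bytes) extracted from in1 at pdst + stride.
 * Used by the horizontal macroblock filters after transposing back. */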
#define VP8_ST6x1_UB(in0, in0_idx, in1, in1_idx, pdst, stride)  \
{                                                               \
    uint32_t tmp0_w;                                            \
    uint16_t tmp0_h;                                            \
                                                                \
    tmp0_w = __msa_copy_u_w((v4i32) in0, in0_idx);              \
    tmp0_h = __msa_copy_u_h((v8i16) in1, in1_idx);              \
    SW(tmp0_w, pdst);                                           \
    SH(tmp0_h, pdst + stride);                                  \
}
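
/* Vertical (horizontal-edge) macroblock filter for a 16-pixel-wide
 * luma edge: loads p3..q3, builds mask/hev, applies the macroblock
 * filter and stores the six modified rows. */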
void ff_vp8_v_loop_filter16_msa(uint8_t *src, ptrdiff_t pitch, int b_limit_in,
                                int limit_in, int thresh_in)
{
    uint8_t *temp_src;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;

    b_limit = (v16u8) __msa_fill_b(b_limit_in);
    limit = (v16u8) __msa_fill_b(limit_in);
    thresh = (v16u8) __msa_fill_b(thresh_in);

    temp_src = src - (pitch << 2);
    LD_UB8(temp_src, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

    temp_src = src - 3 * pitch;
    ST_UB4(p2, p1, p0, q0, temp_src, pitch);
    temp_src += (4 * pitch);
    ST_UB2(q1, q2, temp_src, pitch);
}
 
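/* Same macroblock filter for the two 8-pixel-wide chroma edges: U and
 * V rows are packed side by side into single vectors so both planes
 * are filtered in one pass, then unpacked with 64-bit extracts on the
 * way out. */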
void ff_vp8_v_loop_filter8uv_msa(uint8_t *src_u, uint8_t *src_v,
                                 ptrdiff_t pitch, int b_limit_in, int limit_in,
                                 int thresh_in)
{
    uint8_t *temp_src;
    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
    v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;

    b_limit = (v16u8) __msa_fill_b(b_limit_in);
    limit = (v16u8) __msa_fill_b(limit_in);
    thresh = (v16u8) __msa_fill_b(thresh_in);

    temp_src = src_u - (pitch << 2);
    LD_UB8(temp_src, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
    temp_src = src_v - (pitch << 2);
    LD_UB8(temp_src, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);

    /* right halves hold U pixels, left halves hold V pixels */
    ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
    ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

    p2_d = __msa_copy_u_d((v2i64) p2, 0);
    p1_d = __msa_copy_u_d((v2i64) p1, 0);
    p0_d = __msa_copy_u_d((v2i64) p0, 0);
    q0_d = __msa_copy_u_d((v2i64) q0, 0);
    q1_d = __msa_copy_u_d((v2i64) q1, 0);
    q2_d = __msa_copy_u_d((v2i64) q2, 0);
    src_u -= (pitch * 3);
    SD4(p2_d, p1_d, p0_d, q0_d, src_u, pitch);
    src_u += 4 * pitch;
    SD(q1_d, src_u);
    src_u += pitch;
    SD(q2_d, src_u);

    p2_d = __msa_copy_u_d((v2i64) p2, 1);
    p1_d = __msa_copy_u_d((v2i64) p1, 1);
    p0_d = __msa_copy_u_d((v2i64) p0, 1);
    q0_d = __msa_copy_u_d((v2i64) q0, 1);
    q1_d = __msa_copy_u_d((v2i64) q1, 1);
    q2_d = __msa_copy_u_d((v2i64) q2, 1);
    src_v -= (pitch * 3);
    SD4(p2_d, p1_d, p0_d, q0_d, src_v, pitch);
    src_v += 4 * pitch;
    SD(q1_d, src_v);
    src_v += pitch;
    SD(q2_d, src_v);
}
 
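/* Horizontal (vertical-edge) macroblock filter for a 16-pixel-high
 * luma edge: 16 rows of 8 pixels are loaded and transposed so the edge
 * runs along the vectors, then the same mask/filter pipeline is
 * applied. */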
void ff_vp8_h_loop_filter16_msa(uint8_t *src, ptrdiff_t pitch, int b_limit_in,
                                int limit_in, int thresh_in)
{
    uint8_t *temp_src;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
    v16u8 row9, row10, row11, row12, row13, row14, row15;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

    b_limit = (v16u8) __msa_fill_b(b_limit_in);
    limit = (v16u8) __msa_fill_b(limit_in);
    thresh = (v16u8) __msa_fill_b(thresh_in);

    temp_src = src - 4;
    LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    temp_src += (8 * pitch);
    LD_UB8(temp_src, pitch,
           row8, row9, row10, row11, row12, row13, row14, row15);
    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
    /* the original continues by transposing back and storing the six
       filtered columns row by row with VP8_ST6x1_UB; that store
       sequence is elided here */
}
 
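/* Horizontal macroblock filter for the chroma planes: 8 rows of U and
 * 8 rows of V are loaded, transposed together into p3..q3 and run
 * through the same pipeline. */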
void ff_vp8_h_loop_filter8uv_msa(uint8_t *src_u, uint8_t *src_v,
                                 ptrdiff_t pitch, int b_limit_in, int limit_in,
                                 int thresh_in)
{
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
    v16u8 row9, row10, row11, row12, row13, row14, row15;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

    b_limit = (v16u8) __msa_fill_b(b_limit_in);
    limit = (v16u8) __msa_fill_b(limit_in);
    thresh = (v16u8) __msa_fill_b(thresh_in);

    LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    LD_UB8(src_v - 4, pitch,
           row8, row9, row10, row11, row12, row13, row14, row15);
    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
    /* the original continues by transposing back and storing six
       filtered columns per plane with VP8_ST6x1_UB; elided here */
}
 
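/* Simple vertical filter: only p1, p0, q0, q1 take part; the mask uses
 * b_limit alone and just p0/q0 are written back. */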
void ff_vp8_v_loop_filter_simple_msa(uint8_t *src, ptrdiff_t pitch,
                                     int b_limit_ptr)
{
    v16u8 p1, p0, q1, q0;
    v16u8 mask, b_limit;

    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);

    LD_UB4((src - 2 * pitch), pitch, p1, p0, q0, q1);
    VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
    VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
    ST_UB2(p0, q0, (src - pitch), pitch);
}
 
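/* Simple horizontal filter: 16 rows of 4 pixels are transposed so the
 * edge runs along the vectors; after filtering, the p0/q0 pairs are
 * re-interleaved and stored two bytes per row. */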
void ff_vp8_h_loop_filter_simple_msa(uint8_t *src, ptrdiff_t pitch,
                                     int b_limit_ptr)
{
    uint8_t *temp_src;
    v16u8 p1, p0, q1, q0;
    v16u8 mask, b_limit;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
    v16u8 row9, row10, row11, row12, row13, row14, row15;
    v8i16 tmp0, tmp1;

    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    temp_src = src - 2;
    LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    temp_src += (8 * pitch);
    LD_UB8(temp_src, pitch,
           row8, row9, row10, row11, row12, row13, row14, row15);
    TRANSPOSE16x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p1, p0, q0, q1);
    VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
    VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
    /* interleave the filtered p0/q0 pairs back into per-row order */
    ILVRL_B2_SH(q0, p0, tmp1, tmp0);

    src -= 1;
    ST_H8(tmp1, 0, 1, 2, 3, 4, 5, 6, 7, src, pitch);
    ST_H8(tmp0, 0, 1, 2, 3, 4, 5, 6, 7, src + 8 * pitch, pitch);
}
 
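/* Vertical inner-edge filter for chroma: U and V are packed together,
 * the four-tap filter runs where the mask passes, and p1..q1 are
 * written back with 64-bit extracts per plane. */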
void ff_vp8_v_loop_filter8uv_inner_msa(uint8_t *src_u, uint8_t *src_v,
                                       ptrdiff_t pitch, int b_limit_in,
                                       int limit_in, int thresh_in)
{
    uint64_t p1_d, p0_d, q0_d, q1_d;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
    v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;

    thresh = (v16u8) __msa_fill_b(thresh_in);
    limit = (v16u8) __msa_fill_b(limit_in);
    b_limit = (v16u8) __msa_fill_b(b_limit_in);

    src_u = src_u - (pitch << 2);
    LD_UB8(src_u, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
    src_u += (5 * pitch);
    src_v = src_v - (pitch << 2);
    LD_UB8(src_v, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
    src_v += (5 * pitch);

    /* right halves hold U pixels, left halves hold V pixels */
    ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
    ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

    p1_d = __msa_copy_u_d((v2i64) p1, 0);
    p0_d = __msa_copy_u_d((v2i64) p0, 0);
    q0_d = __msa_copy_u_d((v2i64) q0, 0);
    q1_d = __msa_copy_u_d((v2i64) q1, 0);
    SD4(q1_d, q0_d, p0_d, p1_d, src_u, (- pitch));

    p1_d = __msa_copy_u_d((v2i64) p1, 1);
    p0_d = __msa_copy_u_d((v2i64) p0, 1);
    q0_d = __msa_copy_u_d((v2i64) q0, 1);
    q1_d = __msa_copy_u_d((v2i64) q1, 1);
    SD4(q1_d, q0_d, p0_d, p1_d, src_v, (- pitch));
}
 
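/* Horizontal inner-edge filter for chroma: U and V rows are transposed
 * together, filtered, and the four modified columns re-interleaved and
 * stored four bytes per row. */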
void ff_vp8_h_loop_filter8uv_inner_msa(uint8_t *src_u, uint8_t *src_v,
                                       ptrdiff_t pitch, int b_limit_in,
                                       int limit_in, int thresh_in)
{
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
    v16u8 row9, row10, row11, row12, row13, row14, row15;
    v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;

    thresh = (v16u8) __msa_fill_b(thresh_in);
    limit = (v16u8) __msa_fill_b(limit_in);
    b_limit = (v16u8) __msa_fill_b(b_limit_in);

    LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    LD_UB8(src_v - 4, pitch,
           row8, row9, row10, row11, row12, row13, row14, row15);
    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

    tmp0 = (v4i32) __msa_ilvr_b((v16i8) p0, (v16i8) p1);
    tmp1 = (v4i32) __msa_ilvr_b((v16i8) q1, (v16i8) q0);
    ILVRL_H2_SW(tmp1, tmp0, tmp2, tmp3);
    tmp0 = (v4i32) __msa_ilvl_b((v16i8) p0, (v16i8) p1);
    tmp1 = (v4i32) __msa_ilvl_b((v16i8) q1, (v16i8) q0);
    ILVRL_H2_SW(tmp1, tmp0, tmp4, tmp5);

    ST_W8(tmp2, tmp3, 0, 1, 2, 3, 0, 1, 2, 3, src_u - 2, pitch);
    ST_W8(tmp4, tmp5, 0, 1, 2, 3, 0, 1, 2, 3, src_v - 2, pitch);
}
 
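/* Vertical inner-edge filter for a 16-pixel-wide luma edge; e, i and h
 * are the VP8 edge-limit, interior-limit and hev-threshold values. */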
void ff_vp8_v_loop_filter16_inner_msa(uint8_t *src, ptrdiff_t pitch,
                                      int32_t e, int32_t i, int32_t h)
{
    v16u8 mask, hev, flat;
    v16u8 thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(h);
    b_limit = (v16u8) __msa_fill_b(e);
    limit = (v16u8) __msa_fill_b(i);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

    ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
}
 
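/* Horizontal counterpart of the inner luma filter: transpose 16x8,
 * filter, then store the four modified pixels of each row as a 4-byte
 * word. */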
void ff_vp8_h_loop_filter16_inner_msa(uint8_t *src, ptrdiff_t pitch,
                                      int32_t e, int32_t i, int32_t h)
{
    v16u8 mask, hev, flat;
    v16u8 thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;

    LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    LD_UB8(src - 4 + (8 * pitch), pitch,
           row8, row9, row10, row11, row12, row13, row14, row15);
    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(h);
    b_limit = (v16u8) __msa_fill_b(e);
    limit = (v16u8) __msa_fill_b(i);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
    ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
    ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);

    src -= 2;
    ST_W8(tmp2, tmp3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
    ST_W8(tmp4, tmp5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
}