static const int8_t intra_pred_angle_up[17] = {
    -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32
};

static const int8_t intra_pred_angle_low[16] = {
    32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26
};
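/* HEVC intra prediction angles: intra_pred_angle_up covers the
 * vertical-class modes 18..34, intra_pred_angle_low the horizontal-class
 * modes 2..17; values are the per-row/column displacement in 1/32 pel. */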
 
#define HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,          \
                              mul_val_h0, mul_val_h1, mul_val_h2, mul_val_h3,  \
                              res0, res1, mul_val_b0, mul_val_b1, round)       \
{                                                                              \
    v8i16 res0_m, res1_m, res2_m, res3_m;                                      \
                                                                               \
    MUL4(mul_val_h0, vec0, mul_val_h2, vec0, mul_val_h0, vec1,                 \
         mul_val_h2, vec1, res0_m, res1_m, res2_m, res3_m);                    \
                                                                               \
    res0_m += mul_val_h1 * tmp0;                                               \
    res1_m += mul_val_h3 * tmp0;                                               \
    res2_m += mul_val_h1 * tmp0;                                               \
    res3_m += mul_val_h3 * tmp0;                                               \
                                                                               \
    res0_m += mul_val_b0 * src0_r;                                             \
    res1_m += mul_val_b0 * src0_l;                                             \
    res2_m += (mul_val_b0 - 1) * src0_r;                                       \
    res3_m += (mul_val_b0 - 1) * src0_l;                                       \
                                                                               \
    res0_m += mul_val_b1 * tmp1;                                               \
    res1_m += mul_val_b1 * tmp1;                                               \
    res2_m += (mul_val_b1 + 1) * tmp1;                                         \
    res3_m += (mul_val_b1 + 1) * tmp1;                                         \
                                                                               \
    SRARI_H4_SH(res0_m, res1_m, res2_m, res3_m, round);                        \
    PCKEV_B2_SH(res1_m, res0_m, res3_m, res2_m, res0, res1);                   \
}
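/* The macro above evaluates the HEVC planar formula (spec 8.4.4.2.4)
 *   pred[x][y] = ((nT - 1 - x) * left[y] + (x + 1) * top[nT] +
 *                 (nT - 1 - y) * top[x] + (y + 1) * left[nT] + nT)
 *                >> (log2(nT) + 1)
 * for two 16-pixel rows at a time, with SRARI doing the rounding shift
 * and PCKEV packing the halfword results back to bytes. */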
    v8i16 vec0, vec1, vec2;

    src_data = LW(src_top);
    SW4(src_data, src_data, src_data, src_data, dst, stride);

        src_data = LW(src_left);

        vec2 = (v8i16) __msa_insert_w((v4i32) vec2, 0, src_data);

        vec0 = __msa_fill_h(src_left[-1]);
        vec1 = __msa_fill_h(src_top[0]);

        vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);

        for (col = 0; col < 4; col++) {
            dst[stride * col] = (uint8_t) vec2[col];
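/* Vertical mode: SW4 replicates the four top neighbours down all four
 * rows. The vec2/vec0/vec1 arithmetic then applies the HEVC edge filter
 * to column 0, dst[0][y] = clip(top[0] + ((left[y] - corner) >> 1)),
 * which is only done for small luma blocks. */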
 
    uint16_t val0, val1, val2, val3;

    v8i16 vec0, vec1, vec2;

    src_data1 = LD(src_top);

    for (row = 8; row--;) {
        SD(src_data1, tmp_dst);

        src_data1 = LD(src_left);

        vec2 = (v8i16) __msa_insert_d((v2i64) zero, 0, src_data1);

        vec0 = __msa_fill_h(src_left[-1]);
        vec1 = __msa_fill_h(src_top[0]);

        vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
 
    v8i16 vec0, vec1, vec2, vec3;

    src = LD_UB(src_top);

    for (row = 16; row--;) {

        src = LD_UB(src_left);

        vec0 = __msa_fill_h(src_left[-1]);
        vec1 = __msa_fill_h(src_top[0]);

        SUB2(vec2, vec0, vec3, vec0, vec2, vec3);

        ADD2(vec2, vec1, vec3, vec1, vec2, vec3);

        src = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);

        for (col = 0; col < 16; col++) {
            dst[stride * col] = src[col];
 
    uint32_t val0, val1, val2, val3;

    v8i16 src0_r, src_top_val, src_left_val;

    val0 = src_left[0] * 0x01010101;
    val1 = src_left[1] * 0x01010101;
    val2 = src_left[2] * 0x01010101;
    val3 = src_left[3] * 0x01010101;
    SW4(val0, val1, val2, val3, dst, stride);

        src0 = (v16i8) __msa_insert_w((v4i32) src0, 0, val0);
        src_top_val = __msa_fill_h(src_top[-1]);
        src_left_val = __msa_fill_h(src_left[0]);

        src0_r = (v8i16) __msa_ilvr_b(zero, src0);

        src0_r -= src_top_val;

        src0_r += src_left_val;

        src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
        val0 = __msa_copy_s_w((v4i32) src0, 0);
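/* Horizontal mode: multiplying a left-column pixel by 0x01010101 smears
 * it across all four bytes of a word, so each row store is a single
 * splat. The tail then filters row 0 per the spec:
 * dst[x][0] = clip(left[0] + ((top[x] - corner) >> 1)). */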
 
    uint64_t val0, val1, val2, val3;

    v8i16 src0_r, src_top_val, src_left_val;

    val0 = src_left[0] * 0x0101010101010101;
    val1 = src_left[1] * 0x0101010101010101;
    val2 = src_left[2] * 0x0101010101010101;
    val3 = src_left[3] * 0x0101010101010101;
    SD4(val0, val1, val2, val3, dst, stride);

    val0 = src_left[4] * 0x0101010101010101;
    val1 = src_left[5] * 0x0101010101010101;
    val2 = src_left[6] * 0x0101010101010101;
    val3 = src_left[7] * 0x0101010101010101;
    SD4(val0, val1, val2, val3, dst + 4 * stride, stride);

        src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, val0);
        src_top_val = __msa_fill_h(src_top[-1]);
        src_left_val = __msa_fill_h(src_left[0]);

        src0_r = (v8i16) __msa_ilvr_b(zero, src0);

        src0_r -= src_top_val;

        src0_r += src_left_val;

        src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
        val0 = __msa_copy_s_d((v2i64) src0, 0);
 
    uint8_t inp0, inp1, inp2, inp3;

    v8i16 src0_r, src0_l, src_left_val, src_top_val;

    src_left_val = __msa_fill_h(src_left[0]);

    for (row = 4; row--;) {

        src0 = __msa_fill_b(inp0);
        src1 = __msa_fill_b(inp1);
        src2 = __msa_fill_b(inp2);
        src3 = __msa_fill_b(inp3);

        ST_SB4(src0, src1, src2, src3, tmp_dst, stride);

        src0 = LD_SB(src_top);
        src_top_val = __msa_fill_h(src_top[-1]);

        SUB2(src0_r, src_top_val, src0_l, src_top_val, src0_r, src0_l);

        ADD2(src0_r, src_left_val, src0_l, src_left_val, src0_r, src0_l);

        src0 = __msa_pckev_b((v16i8) src0_l, (v16i8) src0_r);
 
    uint8_t inp0, inp1, inp2, inp3;

    for (row = 0; row < 8; row++) {
        inp0 = src_left[row * 4];
        inp1 = src_left[row * 4 + 1];
        inp2 = src_left[row * 4 + 2];
        inp3 = src_left[row * 4 + 3];

        src0 = __msa_fill_b(inp0);
        src1 = __msa_fill_b(inp1);
        src2 = __msa_fill_b(inp2);
        src3 = __msa_fill_b(inp3);

        ST_SB2(src0, src0, dst, 16);

        ST_SB2(src1, src1, dst, 16);

        ST_SB2(src2, src2, dst, 16);

        ST_SB2(src3, src3, dst, 16);
 
    uint32_t addition = 0;
    uint32_t val0, val1, val2;

    v8u16 sum, vec0, vec1;

    sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
    sum = (v8u16) __msa_hadd_u_w(sum, sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_srari_w((v4i32) sum, 3);
    addition = __msa_copy_u_w((v4i32) sum, 0);
    store = (v16u8) __msa_fill_b(addition);
    val0 = __msa_copy_u_w((v4i32) store, 0);
    SW4(val0, val0, val0, val0, dst, stride);

        ILVR_B2_UH(zero, store, zero, src, vec0, vec1);

        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
        store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
        val1 = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
        store = (v16u8) __msa_insert_b((v16i8) store, 0, val1);
        val0 = __msa_copy_u_w((v4i32) store, 0);

        ADD2(val0, addition, val1, addition, val0, val1);

        tmp_dst[stride * 1] = val0;
        tmp_dst[stride * 2] = val1;
        tmp_dst[stride * 3] = val2;
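/* DC mode with boundary smoothing: the hadd chain above reduces the 4+4
 * neighbours to their rounded average (the DC value); row 0 and column 0
 * are then refiltered as (neighbour + 3 * dc + 2) >> 2 and the corner as
 * (left[0] + 2 * dc + top[0] + 2) >> 2, per the spec. */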
 
    uint32_t row, col, val;
    uint32_t addition = 0;

    v8u16 sum, vec0, vec1;

    sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
    sum = (v8u16) __msa_hadd_u_w(sum, sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_srari_w((v4i32) sum, 4);
    addition = __msa_copy_u_w((v4i32) sum, 0);
    store = (v16u8) __msa_fill_b(addition);
    val0 = __msa_copy_u_d((v2i64) store, 0);

    for (row = 8; row--;) {

        ILVR_B2_UH(zero, store, zero, src, vec0, vec1);

        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
        store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
        val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
        store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
        val0 = __msa_copy_u_d((v2i64) store, 0);

        src = (v16u8) __msa_insert_d((v2i64) src, 0, val0);
        vec1 = (v8u16) __msa_ilvr_b(zero, (v16i8) src);
        vec0 = (v8u16) __msa_fill_h(addition);

        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);

        for (col = 1; col < 8; col++) {
            tmp_dst[stride * col] = vec1[col];
 
    uint32_t row, col, val;
    uint32_t addition = 0;
    v16u8 src_above1, store, src_left1;
    v8u16 sum, sum_above, sum_left;
    v8u16 vec0, vec1, vec2;

    src_above1 = LD_UB(src_top);
    src_left1 = LD_UB(src_left);

    HADD_UB2_UH(src_above1, src_left1, sum_above, sum_left);
    sum = sum_above + sum_left;
    sum = (v8u16) __msa_hadd_u_w(sum, sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_srari_w((v4i32) sum, 5);
    addition = __msa_copy_u_w((v4i32) sum, 0);
    store = (v16u8) __msa_fill_b(addition);

    for (row = 16; row--;) {

        vec0 = (v8u16) __msa_ilvr_b(zero, (v16i8) store);

        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);

        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);

        store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);
        val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
        store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
        ST_UB(store, tmp_dst);

        vec0 = (v8u16) __msa_fill_h(addition);

        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);

        store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);

        for (col = 1; col < 16; col++) {
            tmp_dst[stride * col] = store[col];
 
    v16u8 src_above1, src_above2, store, src_left1, src_left2;
    v8u16 sum_above1, sum_above2;
    v8u16 sum_left1, sum_left2;
    v8u16 sum, sum_above, sum_left;

    LD_UB2(src_top, 16, src_above1, src_above2);
    LD_UB2(src_left, 16, src_left1, src_left2);
    HADD_UB2_UH(src_above1, src_above2, sum_above1, sum_above2);
    HADD_UB2_UH(src_left1, src_left2, sum_left1, sum_left2);
    sum_above = sum_above1 + sum_above2;
    sum_left = sum_left1 + sum_left2;
    sum = sum_above + sum_left;
    sum = (v8u16) __msa_hadd_u_w(sum, sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_srari_w((v4i32) sum, 6);
    store = (v16u8) __msa_splati_b((v16i8) sum, 0);

    for (row = 16; row--;) {
        ST_UB2(store, store, dst, 16);

        ST_UB2(store, store, dst, 16);
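/* 32x32 DC: HEVC applies no DC boundary filtering at this size, so the
 * rounded average of the 32+32 neighbours (hence the shift by 6) is
 * simply broadcast and stored over the whole block. */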
 
    v16i8 src_vec0, src_vec1;
    v8i16 src_vec0_r, src1_r, tmp0, tmp1, mul_val1;
    v8i16 vec0, vec1, vec2, vec3, res0, res1, res2, res3;
    v8i16 mul_val0 = { 3, 2, 1, 0, 1, 2, 3, 4 };

    mul_val1 = (v8i16) __msa_pckod_d((v2i64) mul_val0, (v2i64) mul_val0);

    src_vec0 = (v16i8) __msa_insert_w((v4i32) zero, 0, src0);
    src_vec1 = (v16i8) __msa_insert_w((v4i32) zero, 0, src1);

    ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src1_r);
    SPLATI_H4_SH(src1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);

    tmp0 = __msa_fill_h(src_top[4]);
    tmp1 = __msa_fill_h(src_left[4]);

    MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
         res0, res1, res2, res3);

    res0 += mul_val1 * tmp0;
    res1 += mul_val1 * tmp0;
    res2 += mul_val1 * tmp0;
    res3 += mul_val1 * tmp0;

    res0 += 3 * src_vec0_r;
    res1 += 2 * src_vec0_r;

    src_vec0 = __msa_pckev_b((v16i8) res1, (v16i8) res0);
    ST4x4_UB(src_vec0, src_vec0, 0, 1, 2, 3, dst, stride);
 
    v16i8 src_vec0, src_vec1, src_vec2, src_vec3;
    v8i16 src_vec0_r, src_vec1_r;
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v8i16 tmp0, tmp1, tmp2;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v8i16 mul_val0 = { 7, 6, 5, 4, 3, 2, 1, 0 };

    src_vec0 = (v16i8) __msa_insert_d((v2i64) zero, 0, src0);
    src_vec1 = (v16i8) __msa_insert_d((v2i64) zero, 0, src1);

    ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src_vec1_r);
    SPLATI_H4_SH(src_vec1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);
    SPLATI_H4_SH(src_vec1_r, 4, 5, 6, 7, vec4, vec5, vec6, vec7);

    tmp0 = __msa_fill_h(src_top[8]);
    tmp1 = __msa_fill_h(src_left[8]);

    MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
         res0, res1, res2, res3);
    MUL4(mul_val0, vec4, mul_val0, vec5, mul_val0, vec6, mul_val0, vec7,
         res4, res5, res6, res7);

    tmp2 = mul_val1 * tmp0;

    res0 += 7 * src_vec0_r;
    res1 += 6 * src_vec0_r;
    res2 += 5 * src_vec0_r;
    res3 += 4 * src_vec0_r;
    res4 += 3 * src_vec0_r;
    res5 += 2 * src_vec0_r;

    PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
                src_vec0, src_vec1, src_vec2, src_vec3);

    ST8x8_UB(src_vec0, src_vec1, src_vec2, src_vec3, dst, stride);
 
    v8i16 src0_r, src1_r, src0_l, src1_l;

    v8i16 res0, res1, tmp0, tmp1;
    v8i16 mul_val2, mul_val3;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v8i16 mul_val0 = { 15, 14, 13, 12, 11, 10, 9, 8 };

    src0 = LD_UB(src_top);
    src1 = LD_UB(src_left);

    mul_val2 = mul_val0 - 8;
    mul_val3 = mul_val1 + 8;

    tmp0 = __msa_fill_h(src_top[16]);
    tmp1 = __msa_fill_h(src_left[16]);

    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 15, 1, 5);
    ST_SH2(res0, res1, dst, stride);

    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 13, 3, 5);
    ST_SH2(res0, res1, dst, stride);

    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 11, 5, 5);
    ST_SH2(res0, res1, dst, stride);

    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 9, 7, 5);
    ST_SH2(res0, res1, dst, stride);

    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 7, 9, 5);
    ST_SH2(res0, res1, dst, stride);

    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 5, 11, 5);
    ST_SH2(res0, res1, dst, stride);

    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 3, 13, 5);
    ST_SH2(res0, res1, dst, stride);

    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 1, 15, 5);
    ST_SH2(res0, res1, dst, stride);
 
    v8i16 src0_r, src1_r, src0_l, src1_l;
    v8i16 vec0, vec1, res0, res1;

    v8i16 mul_val2, mul_val3;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };

    tmp0 = __msa_fill_h(src_top[32 - offset]);
    tmp1 = __msa_fill_h(src_left[32]);

    src0 = LD_SB(src_top);
    src1 = LD_SB(src_left);

    mul_val2 = mul_val0 - 8;
    mul_val3 = mul_val1 + 8;

    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 31, 1, 6);
    ST_SH2(res0, res1, dst, stride);

    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 29, 3, 6);
    ST_SH2(res0, res1, dst, stride);

    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 27, 5, 6);
    ST_SH2(res0, res1, dst, stride);

    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 25, 7, 6);
    ST_SH2(res0, res1, dst, stride);

    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 23, 9, 6);
    ST_SH2(res0, res1, dst, stride);

    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 21, 11, 6);
    ST_SH2(res0, res1, dst, stride);

    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 19, 13, 6);
    ST_SH2(res0, res1, dst, stride);

    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 17, 15, 6);
    ST_SH2(res0, res1, dst, stride);
 
    v8i16 src0_r, src1_r, src0_l, src1_l;
    v8i16 vec0, vec1, res0, res1, tmp0, tmp1;
    v8i16 mul_val2, mul_val3;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };

    tmp0 = __msa_fill_h(src_top[32 - offset]);
    tmp1 = __msa_fill_h(src_left[16]);

    src0 = LD_SB(src_top);
    src1 = LD_SB(src_left);

    mul_val2 = mul_val0 - 8;
    mul_val3 = mul_val1 + 8;

    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 15, 17, 6);
    ST_SH2(res0, res1, dst, stride);

    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 13, 19, 6);
    ST_SH2(res0, res1, dst, stride);

    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 11, 21, 6);
    ST_SH2(res0, res1, dst, stride);

    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 9, 23, 6);
    ST_SH2(res0, res1, dst, stride);

    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 7, 25, 6);
    ST_SH2(res0, res1, dst, stride);

    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 5, 27, 6);
    ST_SH2(res0, res1, dst, stride);

    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 3, 29, 6);
    ST_SH2(res0, res1, dst, stride);

    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 1, 31, 6);
    ST_SH2(res0, res1, dst, stride);

                                  (dst + 16), stride, 16);

                                  (dst + 16), stride, 16);
 
    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };

    uint8_t *ref_tmp = ref_array + 4;

    int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;

    v16i8 top0, top1, top2, top3;

    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;

    inv_angle_val = inv_angle[mode - 18];

    if (angle < 0 && last < -1) {
        inv_angle_val = inv_angle[mode - 18];

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
            ref_tmp[h_cnt] = src_left[offset];
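/* For negative prediction angles the left neighbours are projected onto
 * an extended top reference using the inverse angle:
 * ref[x] = src_left[-1 + ((x * inv_angle_val + 128) >> 8)], with
 * inv_angle = round(256 * 32 / angle) as tabulated above. */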
 
    idx0 = angle_loop >> 5;
    fact_val0 = angle_loop & 31;

    idx1 = angle_loop >> 5;
    fact_val1 = angle_loop & 31;

    idx2 = angle_loop >> 5;
    fact_val2 = angle_loop & 31;

    idx3 = angle_loop >> 5;
    fact_val3 = angle_loop & 31;

    top0 = LD_SB(ref + idx0 + 1);
    top1 = LD_SB(ref + idx1 + 1);
    top2 = LD_SB(ref + idx2 + 1);
    top3 = LD_SB(ref + idx3 + 1);

    fact0 = __msa_fill_h(fact_val0);
    fact1 = __msa_fill_h(32 - fact_val0);

    fact2 = __msa_fill_h(fact_val1);
    fact3 = __msa_fill_h(32 - fact_val1);

    fact4 = __msa_fill_h(fact_val2);
    fact5 = __msa_fill_h(32 - fact_val2);

    fact6 = __msa_fill_h(fact_val3);
    fact7 = __msa_fill_h(32 - fact_val3);

    ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
    ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
    ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
               diff0, diff2, diff4, diff6);
    SLDI_B4_0_SH(diff0, diff2, diff4, diff6, diff1, diff3, diff5, diff7, 2);
    ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
    ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
    MUL2(diff1, fact0, diff3, fact2, diff1, diff3);

    diff1 += diff0 * fact1;
    diff3 += diff2 * fact3;

    dst_val0 = __msa_pckev_b((v16i8) diff3, (v16i8) diff1);
    ST4x4_UB(dst_val0, dst_val0, 0, 1, 2, 3, dst, stride);
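/* Angular interpolation (spec 8.4.4.2.6): with idx = pos >> 5 and
 * fact = pos & 31, each output pixel is
 * ((32 - fact) * ref[idx + 1] + fact * ref[idx + 2] + 16) >> 5.
 * MUL2 plus the two multiply-accumulates above form that weighted sum
 * for four rows at once; the rounding shift is applied before the final
 * pack. */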
 
    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 8;

    const uint8_t *src_left_tmp = src_left - 1;

    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;

    int32_t inv_angle_val, inv_angle_val_loop;

    v16i8 top0, top1, top2, top3;
    v16u8 dst_val0, dst_val1;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;

    inv_angle_val = inv_angle[mode - 18];
    last = (angle) >> 2;

        inv_angle_val_loop = inv_angle_val * last;

        SW(tmp1, ref_tmp + 4);
        SW(tmp2, ref_tmp + 8);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (inv_angle_val_loop + 128) >> 8;
            ref_tmp[h_cnt] = src_left_tmp[offset];
            inv_angle_val_loop += inv_angle_val;

    for (v_cnt = 0; v_cnt < 2; v_cnt++) {
        idx0 = (angle_loop) >> 5;
        fact_val0 = (angle_loop) & 31;
        angle_loop += angle;

        idx1 = (angle_loop) >> 5;
        fact_val1 = (angle_loop) & 31;
        angle_loop += angle;

        idx2 = (angle_loop) >> 5;
        fact_val2 = (angle_loop) & 31;
        angle_loop += angle;

        idx3 = (angle_loop) >> 5;
        fact_val3 = (angle_loop) & 31;
        angle_loop += angle;

        top0 = LD_SB(ref + idx0 + 1);
        top1 = LD_SB(ref + idx1 + 1);
        top2 = LD_SB(ref + idx2 + 1);
        top3 = LD_SB(ref + idx3 + 1);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);
        fact4 = __msa_fill_h(fact_val2);
        fact5 = __msa_fill_h(32 - fact_val2);
        fact6 = __msa_fill_h(fact_val3);
        fact7 = __msa_fill_h(32 - fact_val3);

        SLDI_B2_SH(diff1, diff3, diff0, diff2, diff1, diff3, 2);
        SLDI_B2_SH(diff5, diff7, diff4, diff6, diff5, diff7, 2);
        MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
             diff1, diff3, diff5, diff7);

        diff1 += diff0 * fact1;
        diff3 += diff2 * fact3;
        diff5 += diff4 * fact5;
        diff7 += diff6 * fact7;

        PCKEV_B2_UB(diff3, diff1, diff7, diff5, dst_val0, dst_val1);
        ST8x4_UB(dst_val0, dst_val1, dst, stride);
 
    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;

    int32_t inv_angle_val, inv_angle_val_loop;
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 16;

    const uint8_t *src_left_tmp = src_left - 1;

    v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
    v16i8 dst0, dst1, dst2, dst3;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;

    inv_angle_val = inv_angle[mode - 18];

        inv_angle_val_loop = inv_angle_val * last;

        tmp0 = LW(ref + 16);
        ST_UB(top0, ref_tmp);
        SW(tmp0, ref_tmp + 16);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (inv_angle_val_loop + 128) >> 8;
            ref_tmp[h_cnt] = src_left_tmp[offset];
            inv_angle_val_loop += inv_angle_val;

    for (v_cnt = 4; v_cnt--;) {
        idx0 = (angle_loop) >> 5;
        fact_val0 = (angle_loop) & 31;
        angle_loop += angle;

        idx1 = (angle_loop) >> 5;
        fact_val1 = (angle_loop) & 31;
        angle_loop += angle;

        idx2 = (angle_loop) >> 5;
        fact_val2 = (angle_loop) & 31;
        angle_loop += angle;

        idx3 = (angle_loop) >> 5;
        fact_val3 = (angle_loop) & 31;
        angle_loop += angle;

        LD_UB2(ref + idx0 + 1, 16, top0, top1);
        LD_UB2(ref + idx1 + 1, 16, top2, top3);
        LD_UB2(ref + idx2 + 1, 16, top4, top5);
        LD_UB2(ref + idx3 + 1, 16, top6, top7);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);
        fact4 = __msa_fill_h(fact_val2);
        fact5 = __msa_fill_h(32 - fact_val2);
        fact6 = __msa_fill_h(fact_val3);
        fact7 = __msa_fill_h(32 - fact_val3);

        SLDI_B2_UB(top1, top3, top0, top2, top1, top3, 1);
        SLDI_B2_UB(top5, top7, top4, top6, top5, top7, 1);

        MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
             diff2, diff3, diff6, diff7);
        MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
             diff10, diff11, diff14, diff15);

        diff2 += diff0 * fact1;
        diff3 += diff1 * fact1;
        diff6 += diff4 * fact3;
        diff7 += diff5 * fact3;
        diff10 += diff8 * fact5;
        diff11 += diff9 * fact5;
        diff14 += diff12 * fact7;
        diff15 += diff13 * fact7;

        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
                    dst0, dst1, dst2, dst3);
        ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
 
    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
    uint8_t ref_array[3 * 32 + 4];

    const uint8_t *src_left_tmp = src_left - 1;
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t tmp0, tmp1, tmp2, tmp3;

    int32_t inv_angle_val, inv_angle_val_loop;

    v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
    v16i8 dst0, dst1, dst2, dst3;
    v8i16 fact0, fact1, fact2, fact3;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;

    ref_tmp = ref_array + 32;

    inv_angle_val = inv_angle[mode - 18];

        inv_angle_val_loop = inv_angle_val * last;
        LD_UB2(ref, 16, top0, top1);

        ST_UB2(top0, top1, ref_tmp, 16);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (inv_angle_val_loop + 128) >> 8;
            ref_tmp[h_cnt] = src_left_tmp[offset];
            inv_angle_val_loop += inv_angle_val;

    for (v_cnt = 16; v_cnt--;) {
        idx0 = (angle_loop) >> 5;
        fact_val0 = (angle_loop) & 31;
        angle_loop += angle;

        idx1 = (angle_loop) >> 5;
        fact_val1 = (angle_loop) & 31;
        angle_loop += angle;

        top0 = LD_UB(ref + idx0 + 1);
        top4 = LD_UB(ref + idx1 + 1);
        top1 = LD_UB(ref + idx0 + 17);
        top5 = LD_UB(ref + idx1 + 17);
        top3 = LD_UB(ref + idx0 + 33);
        top7 = LD_UB(ref + idx1 + 33);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);

        SLDI_B2_UB(top1, top3, top0, top2, top1, top3, 1);
        SLDI_B2_UB(top5, top7, top4, top6, top5, top7, 1);

        MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
             diff2, diff3, diff6, diff7);
        MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
             diff10, diff11, diff14, diff15);

        diff2 += diff0 * fact1;
        diff3 += diff1 * fact1;
        diff6 += diff4 * fact1;
        diff7 += diff5 * fact1;
        diff10 += diff8 * fact3;
        diff11 += diff9 * fact3;
        diff14 += diff12 * fact3;
        diff15 += diff13 * fact3;

        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
                    dst0, dst1, dst2, dst3);

        ST_SB2(dst0, dst1, dst, 16);

        ST_SB2(dst2, dst3, dst, 16);
 
    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 4;

    int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t angle, angle_loop, inv_angle_val;

    v16i8 dst_val0, dst_val1;
    v16u8 top0, top1, top2, top3;

    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;

        inv_angle_val = inv_angle[mode - 11];

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
            ref_tmp[h_cnt] = src_top[offset];

    idx0 = angle_loop >> 5;
    fact_val0 = angle_loop & 31;
    angle_loop += angle;

    idx1 = angle_loop >> 5;
    fact_val1 = angle_loop & 31;
    angle_loop += angle;

    idx2 = angle_loop >> 5;
    fact_val2 = angle_loop & 31;
    angle_loop += angle;

    idx3 = angle_loop >> 5;
    fact_val3 = angle_loop & 31;

    top0 = LD_UB(ref + idx0 + 1);
    top1 = LD_UB(ref + idx1 + 1);
    top2 = LD_UB(ref + idx2 + 1);
    top3 = LD_UB(ref + idx3 + 1);

    fact0 = __msa_fill_h(fact_val0);
    fact1 = __msa_fill_h(32 - fact_val0);
    fact2 = __msa_fill_h(fact_val1);
    fact3 = __msa_fill_h(32 - fact_val1);
    fact4 = __msa_fill_h(fact_val2);
    fact5 = __msa_fill_h(32 - fact_val2);
    fact6 = __msa_fill_h(fact_val3);
    fact7 = __msa_fill_h(32 - fact_val3);

    ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
    ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
    ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
               diff0, diff2, diff4, diff6);
    SLDI_B4_0_SH(diff0, diff2, diff4, diff6, diff1, diff3, diff5, diff7, 2);
    ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
    ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
    MUL2(diff1, fact0, diff3, fact2, diff1, diff3);

    diff1 += diff0 * fact1;
    diff3 += diff2 * fact3;

    PCKEV_B2_SB(diff1, diff1, diff3, diff3, dst_val0, dst_val1);

    diff0 = (v8i16) __msa_pckev_b(dst_val1, dst_val0);
    diff1 = (v8i16) __msa_pckod_b(dst_val1, dst_val0);

    diff2 = (v8i16) __msa_pckev_w((v4i32) diff1, (v4i32) diff0);

    dst_val0 = __msa_pckev_b((v16i8) diff2, (v16i8) diff2);
    dst_val1 = __msa_pckod_b((v16i8) diff2, (v16i8) diff2);
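/* Horizontal-class modes predict along the left reference, so the
 * computed vectors hold columns rather than rows; the pckev/pckod
 * cascade above is a 4x4 byte transpose producing row-ordered output. */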
 
    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 8;

    const uint8_t *src_top_tmp = src_top - 1;

    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t angle, angle_loop, inv_angle_val;
    v16i8 top0, top1, top2, top3;
    v16i8 dst_val0, dst_val1, dst_val2, dst_val3;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;

    last = (angle) >> 2;

        inv_angle_val = inv_angle[mode - 11];

        SW(tmp1, ref_tmp + 4);
        SW(tmp2, ref_tmp + 8);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (h_cnt * inv_angle_val + 128) >> 8;
            ref_tmp[h_cnt] = src_top_tmp[offset];

    for (v_cnt = 0; v_cnt < 2; v_cnt++) {

        idx0 = angle_loop >> 5;
        fact_val0 = angle_loop & 31;
        angle_loop += angle;

        idx1 = angle_loop >> 5;
        fact_val1 = angle_loop & 31;
        angle_loop += angle;

        idx2 = angle_loop >> 5;
        fact_val2 = angle_loop & 31;
        angle_loop += angle;

        idx3 = angle_loop >> 5;
        fact_val3 = angle_loop & 31;
        angle_loop += angle;

        top0 = LD_SB(ref + idx0 + 1);
        top1 = LD_SB(ref + idx1 + 1);
        top2 = LD_SB(ref + idx2 + 1);
        top3 = LD_SB(ref + idx3 + 1);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);
        fact4 = __msa_fill_h(fact_val2);
        fact5 = __msa_fill_h(32 - fact_val2);
        fact6 = __msa_fill_h(fact_val3);
        fact7 = __msa_fill_h(32 - fact_val3);

        SLDI_B2_SH(diff1, diff3, diff0, diff2, diff1, diff3, 2);
        SLDI_B2_SH(diff5, diff7, diff4, diff6, diff5, diff7, 2);
        MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
             diff1, diff3, diff5, diff7);

        diff1 += diff0 * fact1;
        diff3 += diff2 * fact3;
        diff5 += diff4 * fact5;
        diff7 += diff6 * fact7;

        PCKEV_B4_SB(diff1, diff1, diff3, diff3, diff5, diff5, diff7, diff7,
                    dst_val0, dst_val1, dst_val2, dst_val3);
        ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);

        ST4x8_UB(diff3, diff4, dst_org, stride);
 
    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3, tmp0;
    v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
    v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;

    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 16;
    const uint8_t *ref, *src_top_tmp = src_top - 1;

    last = (angle) >> 1;

        inv_angle_val = inv_angle[mode - 11];

        tmp0 = LW(ref + 16);
        ST_SB(top0, ref_tmp);
        SW(tmp0, ref_tmp + 16);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (h_cnt * inv_angle_val + 128) >> 8;
            ref_tmp[h_cnt] = src_top_tmp[offset];

    for (v_cnt = 0; v_cnt < 4; v_cnt++) {

        idx0 = angle_loop >> 5;
        fact_val0 = angle_loop & 31;
        angle_loop += angle;

        idx1 = angle_loop >> 5;
        fact_val1 = angle_loop & 31;
        angle_loop += angle;

        idx2 = angle_loop >> 5;
        fact_val2 = angle_loop & 31;
        angle_loop += angle;

        idx3 = angle_loop >> 5;
        fact_val3 = angle_loop & 31;
        angle_loop += angle;

        LD_SB2(ref + idx0 + 1, 16, top0, top1);
        LD_SB2(ref + idx1 + 1, 16, top2, top3);
        LD_SB2(ref + idx2 + 1, 16, top4, top5);
        LD_SB2(ref + idx3 + 1, 16, top6, top7);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);
        fact4 = __msa_fill_h(fact_val2);
        fact5 = __msa_fill_h(32 - fact_val2);
        fact6 = __msa_fill_h(fact_val3);
        fact7 = __msa_fill_h(32 - fact_val3);

        SLDI_B2_SB(top1, top3, top0, top2, top1, top3, 1);
        SLDI_B2_SB(top5, top7, top4, top6, top5, top7, 1);

        MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
             diff2, diff3, diff6, diff7);
        MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
             diff10, diff11, diff14, diff15);

        diff2 += diff0 * fact1;
        diff3 += diff1 * fact1;
        diff6 += diff4 * fact3;
        diff7 += diff5 * fact3;
        diff10 += diff8 * fact5;
        diff11 += diff9 * fact5;
        diff14 += diff12 * fact7;
        diff15 += diff13 * fact7;

        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
                    dst_val0, dst_val1, dst_val2, dst_val3);
        ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
        ILVL_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff2, diff3);

        ST4x8_UB(diff4, diff5, dst_org, stride);

        ST4x8_UB(diff6, diff7, dst_org, stride);
 
    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1, tmp0;
    v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
    v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
    v8i16 fact0, fact1, fact2, fact3;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;

    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 32;
    const uint8_t *ref, *src_top_tmp = src_top - 1;

        inv_angle_val = inv_angle[mode - 11];

        LD_SB2(ref, 16, top0, top1);
        tmp0 = LW(ref + 32);
        ST_SB2(top0, top1, ref_tmp, 16);
        SW(tmp0, ref_tmp + 32);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (h_cnt * inv_angle_val + 128) >> 8;
            ref_tmp[h_cnt] = src_top_tmp[offset];

    for (v_cnt = 0; v_cnt < 16; v_cnt++) {

        idx0 = angle_loop >> 5;
        fact_val0 = angle_loop & 31;
        angle_loop += angle;

        idx1 = angle_loop >> 5;
        fact_val1 = angle_loop & 31;
        angle_loop += angle;

        top0 = LD_SB(ref + idx0 + 1);
        top4 = LD_SB(ref + idx1 + 1);
        top1 = LD_SB(ref + idx0 + 17);
        top5 = LD_SB(ref + idx1 + 17);
        top3 = LD_SB(ref + idx0 + 33);
        top7 = LD_SB(ref + idx1 + 33);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);

        SLDI_B2_SB(top1, top3, top0, top2, top1, top3, 1);
        SLDI_B2_SB(top5, top7, top4, top6, top5, top7, 1);

        MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
             diff2, diff3, diff6, diff7);
        MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
             diff10, diff11, diff14, diff15);

        diff2 += diff0 * fact1;
        diff3 += diff1 * fact1;
        diff6 += diff4 * fact1;
        diff7 += diff5 * fact1;
        diff10 += diff8 * fact3;
        diff11 += diff9 * fact3;
        diff14 += diff12 * fact3;
        diff15 += diff13 * fact3;

        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
                    dst_val0, dst_val1, dst_val2, dst_val3);

        ST2x4_UB(diff0, 0, dst_org, stride);

        ST2x4_UB(diff0, 4, dst_org, stride);

        ST2x4_UB(diff1, 0, dst_org, stride);

        ST2x4_UB(diff1, 4, dst_org, stride);

        ST2x4_UB(diff2, 0, dst_org, stride);

        ST2x4_UB(diff2, 4, dst_org, stride);

        ST2x4_UB(diff3, 0, dst_org, stride);

        ST2x4_UB(diff3, 4, dst_org, stride);
 
    src2 = LD_UB(src + 16);

    for (row = 32; row--;) {
        ST_UB2(src1, src2, dst, 16);
    } else if (mode == 26) {

    } else if (mode >= 18) {

    } else if (mode == 26) {

    } else if (mode >= 18) {

    } else if (mode == 26) {

    } else if (mode >= 18) {

    } else if (mode == 26) {

    } else if (mode >= 18) {
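/* Block-size entry points dispatch on the decoded mode: 0 planar, 1 DC,
 * 10 pure horizontal, 26 pure vertical; the remaining modes use the
 * angular kernels, 18..34 through intra_pred_angle_up and 2..17 through
 * intra_pred_angle_low. */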
 
    int hshift = s->sps->hshift[c_idx];
    int vshift = s->sps->vshift[c_idx];
    int size_in_luma_h = 16 << hshift;
    int size_in_tbs_h = size_in_luma_h >> s->sps->log2_min_tb_size;
    int size_in_luma_v = 16 << vshift;
    int size_in_tbs_v = size_in_luma_v >> s->sps->log2_min_tb_size;
    int x = x0 >> hshift;
    int y = y0 >> vshift;
    int x_tb = (x0 >> s->sps->log2_min_tb_size) & s->sps->tb_mask;
    int y_tb = (y0 >> s->sps->log2_min_tb_size) & s->sps->tb_mask;

        s->pps->min_tb_addr_zs[(y_tb) * (s->sps->tb_mask + 2) + (x_tb)];

    int min_pu_width = s->sps->min_pu_width;

    uint8_t left_array[2 * 32 + 1];
    uint8_t filtered_left_array[2 * 32 + 1];
    uint8_t top_array[2 * 32 + 1];
    uint8_t filtered_top_array[2 * 32 + 1];

    uint8_t *left = left_array + 1;

    uint8_t *filtered_left = filtered_left_array + 1;
    uint8_t *filtered_top = filtered_top_array + 1;

        s->pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & s->sps->tb_mask) *
                               (s->sps->tb_mask + 2) + (x_tb - 1)];

        s->pps->min_tb_addr_zs[(y_tb - 1) * (s->sps->tb_mask + 2) +
                               ((x_tb + size_in_tbs_h) & s->sps->tb_mask)];

    int bottom_left_size =
        (((y0 + 2 * size_in_luma_v) >
          (s->sps->height) ? (s->sps->height) : (y0 +
                                                 2 * size_in_luma_v)) -
         (y0 + size_in_luma_v)) >> vshift;
    int top_right_size =
        (((x0 + 2 * size_in_luma_h) >
          (s->sps->width) ? (s->sps->width) : (x0 + 2 * size_in_luma_h)) -
         (x0 + size_in_luma_h)) >> hshift;
 
    if (s->pps->constrained_intra_pred_flag == 1) {
        int size_in_luma_pu_v = ((size_in_luma_v) >> s->sps->log2_min_pu_size);
        int size_in_luma_pu_h = ((size_in_luma_h) >> s->sps->log2_min_pu_size);
        int on_pu_edge_x = !(x0 & ((1 << s->sps->log2_min_pu_size) - 1));
        int on_pu_edge_y = !(y0 & ((1 << s->sps->log2_min_pu_size) - 1));
        if (!size_in_luma_pu_h)
            size_in_luma_pu_h++;
        if (cand_bottom_left == 1 && on_pu_edge_x) {
            int x_left_pu = ((x0 - 1) >> s->sps->log2_min_pu_size);

                ((y0 + size_in_luma_v) >> s->sps->log2_min_pu_size);

                ((size_in_luma_pu_v) >
                 (s->sps->min_pu_height -
                  y_bottom_pu) ? (s->sps->min_pu_height -
                                  y_bottom_pu) : (size_in_luma_pu_v));
            cand_bottom_left = 0;
            for (i = 0; i < max; i += 2)

                                       i) * min_pu_width]).pred_flag ==

        if (cand_left == 1 && on_pu_edge_x) {
            int x_left_pu = ((x0 - 1) >> s->sps->log2_min_pu_size);
            int y_left_pu = ((y0) >> s->sps->log2_min_pu_size);

                ((size_in_luma_pu_v) >
                 (s->sps->min_pu_height -
                  y_left_pu) ? (s->sps->min_pu_height -
                                y_left_pu) : (size_in_luma_pu_v));

            for (i = 0; i < max; i += 2)

                                       i) * min_pu_width]).pred_flag ==

        if (cand_up_left == 1) {
            int x_left_pu = ((x0 - 1) >> s->sps->log2_min_pu_size);
            int y_top_pu = ((y0 - 1) >> s->sps->log2_min_pu_size);

                                 (y_top_pu) * min_pu_width]).pred_flag ==

        if (cand_up == 1 && on_pu_edge_y) {
            int x_top_pu = ((x0) >> s->sps->log2_min_pu_size);
            int y_top_pu = ((y0 - 1) >> s->sps->log2_min_pu_size);

                ((size_in_luma_pu_h) >
                 (s->sps->min_pu_width -
                  x_top_pu) ? (s->sps->min_pu_width -
                               x_top_pu) : (size_in_luma_pu_h));

            for (i = 0; i < max; i += 2)

                                      min_pu_width]).pred_flag == PF_INTRA);

        if (cand_up_right == 1 && on_pu_edge_y) {
            int y_top_pu = ((y0 - 1) >> s->sps->log2_min_pu_size);

                ((x0 + size_in_luma_h) >> s->sps->log2_min_pu_size);

                ((size_in_luma_pu_h) >
                 (s->sps->min_pu_width -
                  x_right_pu) ? (s->sps->min_pu_width -
                                 x_right_pu) : (size_in_luma_pu_h));

            for (i = 0; i < max; i += 2)

                                      min_pu_width]).pred_flag == PF_INTRA);
 
        vec0 = (v16u8) __msa_ldi_b(128);

        ST_UB4(vec0, vec0, vec0, vec0, left, 16);

        ST_UB4(vec0, vec0, vec0, vec0, top, 16);

        left[-1] = src[(-1) + stride * (-1)];

        vec0 = LD_UB(src - stride);

    if (cand_up_right) {
        vec0 = LD_UB(src - stride + 16);
        ST_UB(vec0, (top + 16));

                ((src[(16 + top_right_size - 1) + stride * (-1)]) *

            for (i = 0; i < (16 - top_right_size); i += 4)

        for (i = 0; i < 16; i++)
            left[i] = src[(-1) + stride * (i)];
    if (cand_bottom_left) {
        for (i = 16; i < 16 + bottom_left_size; i++)
            left[i] = src[(-1) + stride * (i)];

                ((src[(-1) + stride * (16 + bottom_left_size - 1)]) *

            for (i = 0; i < (16 - bottom_left_size); i += 4)
                ((((union unaligned_32 *) (left + 16 + bottom_left_size +
 
    if (s->pps->constrained_intra_pred_flag == 1) {
        if (cand_bottom_left || cand_left || cand_up_left || cand_up

                x0 + ((2 * 16) << hshift) <
                s->sps->width ? 2 * 16 : (s->sps->width - x0) >> hshift;

                y0 + ((2 * 16) << vshift) <
                s->sps->height ? 2 * 16 : (s->sps->height - y0) >> vshift;
            int j = 16 + (cand_bottom_left ? bottom_left_size : 0) - 1;
            if (!cand_up_right) {
                size_max_x = x0 + ((16) << hshift) < s->sps->width ?
                    16 : (s->sps->width - x0) >> hshift;

            if (!cand_bottom_left) {
                size_max_y = y0 + ((16) << vshift) < s->sps->height ?
                    16 : (s->sps->height - y0) >> vshift;

            if (cand_bottom_left || cand_left || cand_up_left) {

                                             ((-1) << hshift)) >> s->sps->
                                            log2_min_pu_size)) + (((y0 +

                                          * min_pu_width]).pred_flag ==

                                         ((-1) << hshift)) >> s->sps->
                                        log2_min_pu_size)) + (((y0 + ((j)

                                      * min_pu_width]).pred_flag == PF_INTRA)) {

                    while (j < size_max_x

                                                 ((j) << hshift)) >> s->sps->
                                                log2_min_pu_size)) + (((y0 +

                                              * min_pu_width]).pred_flag ==

                    for (i = j; i > (j) - (j + 1); i--)

                                                   1) << hshift)) >> s->sps->
                                                log2_min_pu_size)) + (((y0 +

                                              * min_pu_width]).pred_flag ==

                            top[i - 1] = top[i];

                while (j < size_max_x

                                             ((j) << hshift)) >> s->sps->
                                            log2_min_pu_size)) + (((y0 + ((-1)

                                          * min_pu_width]).pred_flag ==

                        for (i = j; i > (j) - (j + 1); i--)

                                                    s->sps->log2_min_pu_size))

                                                      s->sps->log2_min_pu_size))

                                                  min_pu_width]).pred_flag ==

                                top[i - 1] = top[i];

                        for (i = j; i > (j) - (j); i--)

                                                    s->sps->log2_min_pu_size))

                                                      s->sps->log2_min_pu_size))

                                                  min_pu_width]).pred_flag ==

                                top[i - 1] = top[i];
 
            if (cand_bottom_left || cand_left) {
                a = ((left[-1]) * 0x01010101U);
                for (i = 0; i < (0) + (size_max_y); i += 4)

                                             ((-1) << hshift)) >> s->sps->
                                            log2_min_pu_size)) + (((y0 +

                                          * min_pu_width]).pred_flag ==

                        a = ((left[i + 3]) * 0x01010101U);

                vec0 = (v16u8) __msa_fill_b(left[-1]);

            if (!cand_bottom_left) {

                vec0 = (v16u8) __msa_fill_b(left[15]);

                ST_UB(vec0, (left + 16));

            if (x0 != 0 && y0 != 0) {
                a = ((left[size_max_y - 1]) * 0x01010101U);
                for (i = (size_max_y - 1);
                     i > (size_max_y - 1) - (size_max_y); i -= 4)

                                             ((-1) << hshift)) >> s->sps->
                                            log2_min_pu_size)) + (((y0 +

                                          * min_pu_width]).pred_flag ==

                        a = ((left[i - 3]) * 0x01010101U);

                                         ((-1) << hshift)) >> s->sps->
                                        log2_min_pu_size)) + (((y0 + ((-1)

                                      * min_pu_width]).pred_flag == PF_INTRA))

            } else if (x0 == 0) {
                    uint32_t pix = ((0) * 0x01010101U);
                    for (i = 0; i < (size_max_y); i += 4)

                a = ((left[size_max_y - 1]) * 0x01010101U);
                for (i = (size_max_y - 1);
                     i > (size_max_y - 1) - (size_max_y); i -= 4)

                                             ((-1) << hshift)) >> s->sps->
                                            log2_min_pu_size)) + (((y0 +

                                          * min_pu_width]).pred_flag ==

                        a = ((left[i - 3]) * 0x01010101U);

                a = ((left[-1]) * 0x01010101U);
                for (i = 0; i < (0) + (size_max_x); i += 4)

                                             ((i) << hshift)) >> s->sps->
                                            log2_min_pu_size)) + (((y0 + ((-1)

                                          * min_pu_width]).pred_flag ==

                        a = ((top[i + 3]) * 0x01010101U);
 
 2318     if (!cand_bottom_left) {
 
 2320             vec0 = (v16u8) __msa_fill_b(left[15]);
 
 2322             ST_UB(vec0, (left + 16));
 
 2324         } 
else if (cand_up_left) {
 
 2325             vec0 = (v16u8) __msa_fill_b(left[-1]);
 
 2327             ST_UB2(vec0, vec0, left, 16);
 
 2330         } 
else if (cand_up) {
 
 2333             vec0 = (v16u8) __msa_fill_b(left[-1]);
 
 2335             ST_UB2(vec0, vec0, left, 16);
 
 2339         } 
else if (cand_up_right) {
 
 2340             vec0 = (v16u8) __msa_fill_b(top[16]);
 
 2346             ST_UB2(vec0, vec0, left, 16);
 
 2353             vec0 = (v16u8) __msa_ldi_b(128);
 
 2355             ST_UB2(vec0, vec0, top, 16);
 
 2356             ST_UB2(vec0, vec0, left, 16);
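
When none of the neighbouring blocks is available, the reference arrays fall
back to the mid-grey default of this 8-bit path:

    1 << (BitDepth - 1) == 1 << 7 == 128

which is exactly the value the __msa_ldi_b(128) splat stores into top[] and
left[] above.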
 
 2361         vec0 = (v16u8) __msa_fill_b(left[16]);
 
 2364     if (!cand_up_left) {
 
 2368         vec0 = (v16u8) __msa_fill_b(left[-1]);
 
 2371     if (!cand_up_right) {
 
 2372         vec0 = (v16u8) __msa_fill_b(top[15]);
 
 2373         ST_UB(vec0, (top + 16));
 
 2379     if (!s->sps->intra_smoothing_disabled_flag
 
 2380         && (c_idx == 0 || s->sps->chroma_format_idc == 3)) {
 
 2382             int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
 
 2383             int min_dist_vert_hor =

 2384                 FFMIN(FFABS((int) (mode - 26U)), FFABS((int) (mode - 10U)));
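
Worked example of this directional-distance test: for mode 18,
min_dist_vert_hor = FFMIN(|18 - 26|, |18 - 10|) = 8. The 16x16 block size
selects intra_hor_ver_dist_thresh[4 - 3] = 1, so 8 > 1 and the reference
smoothing below is applied; for the purely horizontal (10) and vertical (26)
modes the distance is 0 and the references stay unfiltered.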
 
 2392             if (min_dist_vert_hor > intra_hor_ver_dist_thresh[4 - 3]) {
 
 2393                 filtered_left[2 * 16 - 1] = left[2 * 16 - 1];
 
 2394                 filtered_top[2 * 16 - 1] = top[2 * 16 - 1];
 
 2395                 for (i = 2 * 16 - 2; i >= 0; i--)
 
 2396                     filtered_left[i] = (left[i + 1] + 2 * left[i] +
 
 2397                                         left[i - 1] + 2) >> 2;
 
 2400                     (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
 
 2401                 for (i = 2 * 16 - 2; i >= 0; i--)
 
 2402                     filtered_top[i] = (top[i + 1] + 2 * top[i] +
 
 2403                                        top[i - 1] + 2) >> 2;
 
 2404                 left = filtered_left;
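
The loops above are the standard HEVC [1 2 1]/4 reference smoothing filter;
for each interior reference sample p[i],

    \mathrm{filtered}[i] = \left\lfloor \frac{p[i-1] + 2\,p[i] + p[i+1] + 2}{4} \right\rfloor

with the corner sample rebuilt from left[0], left[-1] and top[0] as shown.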
 
 2417                        (uint8_t *) left, stride, 4, c_idx);
 
 2429     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 
 2430     v8i16 res0, res1, res2, res3;
 
 2431     v8i16 mul_val0 = { 63, 62, 61, 60, 59, 58, 57, 56 };
 
 2432     v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
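
mul_val0 and mul_val1 hold the first eight bilinear weight pairs used by the
strong-smoothing code further down: for output index i the corner sample is
weighted by (63 - i) and the far sample by (i + 1), i.e. 63..56 and 1..8 in
lanes 0..7. Subsequent groups of eight lanes are derived from these vectors
by the mul_val0 - 8, -16, ... and mul_val1 + 8, +16, ... adjustments.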
 
 2435     int hshift = s->sps->hshift[c_idx];
 
 2436     int vshift = s->sps->vshift[c_idx];
 
 2437     int size_in_luma_h = 32 << hshift;
 
 2438     int size_in_tbs_h = size_in_luma_h >> s->sps->log2_min_tb_size;
 
 2439     int size_in_luma_v = 32 << vshift;
 
 2440     int size_in_tbs_v = size_in_luma_v >> s->sps->log2_min_tb_size;
 
 2441     int x = x0 >> hshift;
 
 2442     int y = y0 >> vshift;
 
 2443     int x_tb = (x0 >> s->sps->log2_min_tb_size) & s->sps->tb_mask;
 
 2444     int y_tb = (y0 >> s->sps->log2_min_tb_size) & s->sps->tb_mask;
 
 2447         s->pps->min_tb_addr_zs[(y_tb) * (s->sps->tb_mask + 2) + (x_tb)];
 
 2452     int min_pu_width = s->sps->min_pu_width;
 
 2457     uint8_t left_array[2 * 32 + 1];
 
 2458     uint8_t filtered_left_array[2 * 32 + 1];
 
 2459     uint8_t top_array[2 * 32 + 1];
 
 2460     uint8_t filtered_top_array[2 * 32 + 1];
 
 2462     uint8_t *left = left_array + 1;
 
 2464     uint8_t *filtered_left = filtered_left_array + 1;
 
 2465     uint8_t *filtered_top = filtered_top_array + 1;
 
 2468         s->pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & s->sps->tb_mask) *
 
 2469                                (s->sps->tb_mask + 2) + (x_tb - 1)];
 
 2475         s->pps->min_tb_addr_zs[(y_tb - 1) * (s->sps->tb_mask + 2) +
 
 2476                                ((x_tb + size_in_tbs_h) & s->sps->tb_mask)];
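
These min_tb_addr_zs lookups fetch the z-scan addresses of the bottom-left
and top-right neighbours: a neighbour may only be referenced if it precedes
the current transform block in z-scan order, i.e. it has already been
reconstructed. A sketch of the test, assuming curr_tb_addr holds the address
computed earlier for (x_tb, y_tb):

    /* sketch: neighbour usable iff already decoded in z-scan order */
    cand_bottom_left = cand_bottom_left && curr_tb_addr > bottom_left_tb_addr;
    cand_up_right    = cand_up_right    && curr_tb_addr > up_right_tb_addr;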
 
 2478     int bottom_left_size =
 
 2479         (((y0 + 2 * size_in_luma_v) >
 
 2481                                                  2 * size_in_luma_v)) -
 
 2482          (y0 + size_in_luma_v)) >> vshift;
 
 2483     int top_right_size =
 
 2484         (((x0 + 2 * size_in_luma_h) >
 
 2485           (s->sps->width) ? (s->sps->width) : (x0 + 2 * size_in_luma_h)) -
 
 2486          (x0 + size_in_luma_h)) >> hshift;
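
top_right_size and bottom_left_size clamp the extended reference runs to the
picture border:

    top_right_size = (FFMIN(x0 + 2 * size_in_luma_h, s->sps->width)
                      - (x0 + size_in_luma_h)) >> hshift;

For a 32x32 luma block at x0 = width - 48, for instance, only 16 of the 32
top-right reference samples exist; the rest is padded by replication further
down.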
 
 2488     if (s->pps->constrained_intra_pred_flag == 1) {
 
 2489         int size_in_luma_pu_v = ((size_in_luma_v) >> s->sps->log2_min_pu_size);
 
 2490         int size_in_luma_pu_h = ((size_in_luma_h) >> s->sps->log2_min_pu_size);
 
 2491         int on_pu_edge_x = !(x0 & ((1 << s->sps->log2_min_pu_size) - 1));
 
 2492         int on_pu_edge_y = !(y0 & ((1 << s->sps->log2_min_pu_size) - 1));
 
 2493         if (!size_in_luma_pu_h)
 
 2494             size_in_luma_pu_h++;
 
 2495         if (cand_bottom_left == 1 && on_pu_edge_x) {
 
 2496             int x_left_pu = ((x0 - 1) >> s->sps->log2_min_pu_size);
 
 2498                 ((y0 + size_in_luma_v) >> s->sps->log2_min_pu_size);
 
 2500                 ((size_in_luma_pu_v) >
 
 2501                  (s->sps->min_pu_height -
 
 2502                   y_bottom_pu) ? (s->sps->min_pu_height -
 
 2503                                   y_bottom_pu) : (size_in_luma_pu_v));
 
 2504             cand_bottom_left = 0;
 
 2505             for (i = 0; i < max; i += 2)
 
 2509                                        i) * min_pu_width]).pred_flag ==
 
 2512         if (cand_left == 1 && on_pu_edge_x) {
 
 2513             int x_left_pu = ((x0 - 1) >> s->sps->log2_min_pu_size);
 
 2514             int y_left_pu = ((y0) >> s->sps->log2_min_pu_size);
 
 2516                 ((size_in_luma_pu_v) >
 
 2517                  (s->sps->min_pu_height -
 
 2518                   y_left_pu) ? (s->sps->min_pu_height -
 
 2519                                 y_left_pu) : (size_in_luma_pu_v));
 
 2521             for (i = 0; i < max; i += 2)
 
 2525                                        i) * min_pu_width]).pred_flag ==
 
 2528         if (cand_up_left == 1) {
 
 2529             int x_left_pu = ((x0 - 1) >> s->sps->log2_min_pu_size);
 
 2530             int y_top_pu = ((y0 - 1) >> s->sps->log2_min_pu_size);
 
 2533                                  (y_top_pu) * min_pu_width]).pred_flag ==
 
 2536         if (cand_up == 1 && on_pu_edge_y) {
 
 2537             int x_top_pu = ((x0) >> s->sps->log2_min_pu_size);
 
 2538             int y_top_pu = ((y0 - 1) >> s->sps->log2_min_pu_size);
 
 2540                 ((size_in_luma_pu_h) >
 
 2541                  (s->sps->min_pu_width -
 
 2542                   x_top_pu) ? (s->sps->min_pu_width -
 
 2543                                x_top_pu) : (size_in_luma_pu_h));
 
 2545             for (i = 0; i < max; i += 2)
 
 2549                                       min_pu_width]).pred_flag == PF_INTRA);
 
 2551         if (cand_up_right == 1 && on_pu_edge_y) {
 
 2552             int y_top_pu = ((y0 - 1) >> s->sps->log2_min_pu_size);
 
 2554                 ((x0 + size_in_luma_h) >> s->sps->log2_min_pu_size);
 
 2556                 ((size_in_luma_pu_h) >
 
 2557                  (s->sps->min_pu_width -
 
 2558                   x_right_pu) ? (s->sps->min_pu_width -
 
 2559                                  x_right_pu) : (size_in_luma_pu_h));
 
 2561             for (i = 0; i < max; i += 2)
 
 2565                                       min_pu_width]).pred_flag == PF_INTRA);
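
Under constrained intra prediction each cand_* flag is re-derived by ORing
the pred_flag test over the PUs covering that edge. In readable form, with
the expanded tab_mvf indexing folded into a hypothetical is_intra_pu():

    /* sketch of the loops above: an edge counts as available only if at
     * least one PU covering it was intra coded */
    cand_up_right = 0;
    for (i = 0; i < max; i += 2)
        cand_up_right |= is_intra_pu(x_right_pu + i, y_top_pu);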
 
 2567         vec0 = (v16u8) __msa_ldi_b(128);
 
 2569         ST_UB4(vec0, vec0, vec0, vec0, left, 16);
 
 2570         ST_UB4(vec0, vec0, vec0, vec0, top, 16);
 
 2575         left[-1] = src[(-1) + stride * (-1)];
 
 2579         LD_UB2(src - stride, 16, vec0, vec1);
 
 2580         ST_UB2(vec0, vec1, top, 16);
 
 2583     if (cand_up_right) {
 
 2584         LD_UB2(src - stride + 32, 16, vec0, vec1);
 
 2585         ST_UB2(vec0, vec1, (top + 32), 16);
 
 2588                 ((src[(32 + top_right_size - 1) + stride * (-1)]) *
 
 2590             for (i = 0; i < (32 - top_right_size); i += 4)
 
 2596         for (i = 0; i < 32; i++)
 
 2597             left[i] = src[(-1) + stride * (i)];
 
 2598     if (cand_bottom_left) {
 
 2599         for (i = 32; i < 32 + bottom_left_size; i++)
 
 2600             left[i] = src[(-1) + stride * (i)];
 
 2603                 ((src[(-1) + stride * (32 + bottom_left_size - 1)]) *
 
 2605             for (i = 0; i < (32 - bottom_left_size); i += 4)
 
 2606                 ((((union unaligned_32 *) (left + 32 + bottom_left_size +
 
 2611     if (s->pps->constrained_intra_pred_flag == 1) {
 
 2612         if (cand_bottom_left || cand_left || cand_up_left || cand_up
 
 2615                 x0 + ((2 * 32) << hshift) <
 
 2616                 s->sps->width ? 2 * 32 : (s->sps->width - x0) >> hshift;
 
 2618                 y0 + ((2 * 32) << vshift) <
 
 2619                 s->sps->height ? 2 * 32 : (s->sps->height - y0) >> vshift;
 
 2620             int j = 32 + (cand_bottom_left ? bottom_left_size : 0) - 1;
 
 2621             if (!cand_up_right) {
 
 2622                 size_max_x = x0 + ((32) << hshift) < s->sps->width ?

 2623                     32 : (s->sps->width - x0) >> hshift;
 
 2625             if (!cand_bottom_left) {
 
 2626                 size_max_y = y0 + ((32) << vshift) < s->sps->height ?

 2627                     32 : (s->sps->height - y0) >> vshift;
 
 2629             if (cand_bottom_left || cand_left || cand_up_left) {
 
 2633                                              ((-1) << hshift)) >> s->sps->
 
 2634                                             log2_min_pu_size)) + (((y0 +
 
 2639                                           * min_pu_width]).pred_flag ==
 
 2644                                          ((-1) << hshift)) >> s->sps->
 
 2645                                         log2_min_pu_size)) + (((y0 + ((j)
 
 2650                                       * min_pu_width]).pred_flag == PF_INTRA)) {
 
 2652                     while (j < size_max_x
 
 2655                                                  ((j) << hshift)) >> s->sps->
 
 2656                                                 log2_min_pu_size)) + (((y0 +
 
 2662                                               * min_pu_width]).pred_flag ==
 
 2665                     for (i = j; i > (j) - (j + 1); i--)
 
 2669                                                    1) << hshift)) >> s->sps->
 
 2670                                                 log2_min_pu_size)) + (((y0 +
 
 2676                                               * min_pu_width]).pred_flag ==
 
 2678                             top[i - 1] = top[i];
 
 2683                 while (j < size_max_x
 
 2686                                              ((j) << hshift)) >> s->sps->
 
 2687                                             log2_min_pu_size)) + (((y0 + ((-1)
 
 2692                                           * min_pu_width]).pred_flag ==
 
 2697                         for (i = j; i > (j) - (j + 1); i--)
 
 2702                                                     s->sps->log2_min_pu_size))
 
 2706                                                       s->sps->log2_min_pu_size))
 
 2708                                                   min_pu_width]).pred_flag ==
 
 2710                                 top[i - 1] = top[i];
 
 2712                         for (i = j; i > (j) - (j); i--)
 
 2717                                                     s->sps->log2_min_pu_size))
 
 2721                                                       s->sps->log2_min_pu_size))
 
 2723                                                   min_pu_width]).pred_flag ==
 
 2725                                 top[i - 1] = top[i];
 
 2731             if (cand_bottom_left || cand_left) {
 
 2732                 a = ((left[-1]) * 0x01010101U);
 
 2733                 for (i = 0; i < (0) + (size_max_y); i += 4)
 
 2736                                              ((-1) << hshift)) >> s->sps->
 
 2737                                             log2_min_pu_size)) + (((y0 +
 
 2742                                           * min_pu_width]).pred_flag ==
 
 2746                         a = ((left[i + 3]) * 0x01010101U);
 
 2749                 vec0 = (v16u8) __msa_fill_b(left[-1]);
 
 2751                 ST_UB2(vec0, vec0, left, 16);
 
 2753             if (!cand_bottom_left) {
 
 2754                 vec0 = (v16u8) __msa_fill_b(left[31]);
 
 2756                 ST_UB2(vec0, vec0, (left + 32), 16);
 
 2758             if (x0 != 0 && y0 != 0) {
 
 2759                 a = ((left[size_max_y - 1]) * 0x01010101U);
 
 2760                 for (i = (size_max_y - 1);
 
 2761                      i > (size_max_y - 1) - (size_max_y); i -= 4)
 
 2764                                              ((-1) << hshift)) >> s->sps->
 
 2765                                             log2_min_pu_size)) + (((y0 +
 
 2771                                           * min_pu_width]).pred_flag ==
 
 2775                         a = ((left[i - 3]) * 0x01010101U);
 
 2778                                          ((-1) << hshift)) >> s->sps->
 
 2779                                         log2_min_pu_size)) + (((y0 + ((-1)
 
 2784                                       * min_pu_width]).pred_flag == PF_INTRA))

 2786             } else if (x0 == 0) {
 
 2788                     uint32_t pix = ((0) * 0x01010101U);
 
 2789                     for (i = 0; i < (size_max_y); i += 4)
 
 2793                 a = ((left[size_max_y - 1]) * 0x01010101U);
 
 2794                 for (i = (size_max_y - 1);
 
 2795                      i > (size_max_y - 1) - (size_max_y); i -= 4)
 
 2798                                              ((-1) << hshift)) >> s->sps->
 
 2799                                             log2_min_pu_size)) + (((y0 +
 
 2805                                           * min_pu_width]).pred_flag ==
 
 2809                         a = ((left[i - 3]) * 0x01010101U);
 
 2813                 a = ((left[-1]) * 0x01010101U);
 
 2814                 for (i = 0; i < (0) + (size_max_x); i += 4)
 
 2817                                              ((i) << hshift)) >> s->sps->
 
 2818                                             log2_min_pu_size)) + (((y0 + ((-1)
 
 2823                                           * min_pu_width]).pred_flag ==
 
 2827                         a = ((top[i + 3]) * 0x01010101U);
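
The 0x01010101U multiplications above replicate one reference byte into all
four bytes of a 32-bit word, letting the substitution loops patch four
samples per store. A minimal sketch of the idiom, using the same
union unaligned_32 the code above casts to:

    /* sketch: store byte v into four consecutive reference samples */
    static inline void fill4_sketch(uint8_t *p, uint8_t v)
    {
        ((union unaligned_32 *) p)->l = v * 0x01010101U;
    }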
 
 2832     if (!cand_bottom_left) {
 
 2834             vec0 = (v16u8) __msa_fill_b(left[31]);
 
 2836             ST_UB2(vec0, vec0, (left + 32), 16);
 
 2837         } else if (cand_up_left) {
 
 2838             vec0 = (v16u8) __msa_fill_b(left[-1]);
 
 2840             ST_UB4(vec0, vec0, vec0, vec0, left, 16);
 
 2843         } else if (cand_up) {
 
 2846             vec0 = (v16u8) __msa_fill_b(left[-1]);
 
 2848             ST_UB4(vec0, vec0, vec0, vec0, left, 16);
 
 2852         } else if (cand_up_right) {
 
 2853             vec0 = (v16u8) __msa_fill_b(top[32]);
 
 2855             ST_UB2(vec0, vec0, top, 16);
 
 2859             ST_UB4(vec0, vec0, vec0, vec0, left, 16);
 
 2867             vec0 = (v16u8) __msa_ldi_b(128);
 
 2869             ST_UB4(vec0, vec0, vec0, vec0, top, 16);
 
 2870             ST_UB4(vec0, vec0, vec0, vec0, left, 16);
 
 2875         vec0 = (v16u8) __msa_fill_b(left[32]);
 
 2877         ST_UB2(vec0, vec0, left, 16);
 
 2879     if (!cand_up_left) {
 
 2883         vec0 = (v16u8) __msa_fill_b(left[-1]);
 
 2885         ST_UB2(vec0, vec0, top, 16);
 
 2887     if (!cand_up_right) {
 
 2888         vec0 = (v16u8) __msa_fill_b(top[31]);
 
 2890         ST_UB2(vec0, vec0, (top + 32), 16);
 
 2896     if (!s->sps->intra_smoothing_disabled_flag
 
 2897         && (c_idx == 0 || s->sps->chroma_format_idc == 3)) {
 
 2899             int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
 
 2900             int min_dist_vert_hor =

 2901                 FFMIN(FFABS((int) (mode - 26U)), FFABS((int) (mode - 10U)));
 
 2909             if (min_dist_vert_hor > intra_hor_ver_dist_thresh[5 - 3]) {
 
 2910                 int threshold = 1 << (8 - 5);
 
 2911                 if (s->sps->sps_strong_intra_smoothing_enable_flag
 
 2913                     && FFABS(top[-1] + top[63] - 2 * top[31]) < threshold

 2917                     && FFABS(left[-1] + left[63] - 2 * left[31]) < threshold) {
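
This is the strong-intra-smoothing gate: both edges must be nearly linear
before the bilinear filter replaces the [1 2 1] one. With 8-bit samples and
a 32x32 block the test amounts to

    |top[-1]  + top[63]  - 2 * top[31]|  < (1 << (8 - 5)) == 8
    |left[-1] + left[63] - 2 * left[31]| < 8

i.e. the second difference of each edge compared against a bit-depth
dependent threshold.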
 
 2923                     filtered_top[-1] = top[-1];
 
 2924                     filtered_top[63] = top[63];
 
 2927                     for (i = 0; i < 63; i++) {
 
 2929                             ((63 - i) * top[-1] + (i + 1) * top[63] + 32) >> 6;
 
 2932                     tmp0 = __msa_fill_h(top[-1]);
 
 2933                     tmp1 = __msa_fill_h(top[63]);
 
 2935                     tmp2 = mul_val0 - 8;
 
 2936                     tmp3 = mul_val0 - 16;
 
 2937                     tmp4 = mul_val0 - 24;
 
 2938                     tmp5 = mul_val1 + 8;
 
 2939                     tmp6 = mul_val1 + 16;
 
 2940                     tmp7 = mul_val1 + 24;
 
 2942                     res0 = mul_val0 * tmp0;
 
 2946                     res0 += mul_val1 * tmp1;
 
 2947                     res1 += tmp5 * tmp1;
 
 2948                     res2 += tmp6 * tmp1;
 
 2949                     res3 += tmp7 * tmp1;
 
 2951                     res0 = __msa_srari_h(res0, 6);
 
 2952                     res1 = __msa_srari_h(res1, 6);
 
 2953                     res2 = __msa_srari_h(res2, 6);
 
 2954                     res3 = __msa_srari_h(res3, 6);
 
 2956                     vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
 
 2957                     vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
 
 2959                     ST_UB2(vec0, vec1, filtered_top, 16);
 
 2961                     res0 = mul_val0 - 32;
 
 2962                     tmp2 = mul_val0 - 40;
 
 2963                     tmp3 = mul_val0 - 48;
 
 2964                     tmp4 = mul_val0 - 56;
 
 2965                     res3 = mul_val1 + 32;
 
 2966                     tmp5 = mul_val1 + 40;
 
 2967                     tmp6 = mul_val1 + 48;
 
 2968                     tmp7 = mul_val1 + 56;
 
 2973                     res0 += res3 * tmp1;
 
 2975                     res1 += tmp5 * tmp1;
 
 2976                     res2 += tmp6 * tmp1;
 
 2977                     res3 += tmp7 * tmp1;
 
 2979                     res0 = __msa_srari_h(res0, 6);
 
 2980                     res1 = __msa_srari_h(res1, 6);
 
 2981                     res2 = __msa_srari_h(res2, 6);
 
 2982                     res3 = __msa_srari_h(res3, 6);
 
 2984                     vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
 
 2985                     vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
 
 2987                     ST_UB2(vec0, vec1, (filtered_top + 32), 16);
 
 2989                     filtered_top[63] = top[63];
 
 2991                     tmp0 = __msa_fill_h(left[-1]);
 
 2992                     tmp1 = __msa_fill_h(left[63]);
 
 2994                     tmp2 = mul_val0 - 8;
 
 2995                     tmp3 = mul_val0 - 16;
 
 2996                     tmp4 = mul_val0 - 24;
 
 2997                     tmp5 = mul_val1 + 8;
 
 2998                     tmp6 = mul_val1 + 16;
 
 2999                     tmp7 = mul_val1 + 24;
 
 3001                     res0 = mul_val0 * tmp0;
 
 3005                     res0 += mul_val1 * tmp1;
 
 3006                     res1 += tmp5 * tmp1;
 
 3007                     res2 += tmp6 * tmp1;
 
 3008                     res3 += tmp7 * tmp1;
 
 3010                     res0 = __msa_srari_h(res0, 6);
 
 3011                     res1 = __msa_srari_h(res1, 6);
 
 3012                     res2 = __msa_srari_h(res2, 6);
 
 3013                     res3 = __msa_srari_h(res3, 6);
 
 3015                     vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
 
 3016                     vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
 
 3018                     ST_UB2(vec0, vec1, left, 16);
 
 3020                     res0 = mul_val0 - 32;
 
 3021                     tmp2 = mul_val0 - 40;
 
 3022                     tmp3 = mul_val0 - 48;
 
 3023                     tmp4 = mul_val0 - 56;
 
 3024                     res3 = mul_val1 + 32;
 
 3025                     tmp5 = mul_val1 + 40;
 
 3026                     tmp6 = mul_val1 + 48;
 
 3027                     tmp7 = mul_val1 + 56;
 
 3032                     res0 += res3 * tmp1;
 
 3034                     res1 += tmp5 * tmp1;
 
 3035                     res2 += tmp6 * tmp1;
 
 3036                     res3 += tmp7 * tmp1;
 
 3038                     res0 = __msa_srari_h(res0, 6);
 
 3039                     res1 = __msa_srari_h(res1, 6);
 
 3040                     res2 = __msa_srari_h(res2, 6);
 
 3041                     res3 = __msa_srari_h(res3, 6);
 
 3043                     vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
 
 3044                     vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
 
 3046                     ST_UB2(vec0, vec1, (left + 32), 16);
 
 3052                     filtered_left[2 * 32 - 1] = left[2 * 32 - 1];
 
 3053                     filtered_top[2 * 32 - 1] = top[2 * 32 - 1];
 
 3054                     for (i = 2 * 32 - 2; i >= 0; i--)
 
 3055                         filtered_left[i] = (left[i + 1] + 2 * left[i] +
 
 3056                                             left[i - 1] + 2) >> 2;
 
 3059                         (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
 
 3060                     for (i = 2 * 32 - 2; i >= 0; i--)
 
 3061                         filtered_top[i] = (top[i + 1] + 2 * top[i] +
 
 3062                                            top[i - 1] + 2) >> 2;
 
 3063                     left = filtered_left;
 
 3077                        (uint8_t *) left, stride, 5, c_idx);
 
 
void ff_hevc_intra_pred_planar_2_msa(uint8_t *dst, const uint8_t *src_top, const uint8_t *src_left, ptrdiff_t stride)
 
static void hevc_intra_pred_horiz_4x4_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t flag)
 
#define MUL2(in0, in1, in2, in3, out0, out1)
 
static const int8_t intra_pred_angle_up[17]
 
static void hevc_intra_pred_dc_8x8_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t flag)
 
#define SPLATI_H2_SH(...)
 
static void hevc_intra_pred_vert_8x8_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t flag)
 
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
 
static void hevc_intra_pred_plane_32x32_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride)
 
#define UNPCK_UB_SH(in, out0, out1)
 
#define CLIP_SH_0_255(in)
 
#define SPLATI_H4_SH(...)
 
#define SLDI_B4_0_SH(...)
 
void ff_hevc_intra_pred_dc_msa(uint8_t *dst, const uint8_t *src_top, const uint8_t *src_left, ptrdiff_t stride, int log2, int c_idx)
 
#define CLIP_SH2_0_255(in0, in1)
 
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)
 
#define HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,mul_val_h0, mul_val_h1, mul_val_h2, mul_val_h3,res0, res1, mul_val_b0, mul_val_b1, round)
 
void (*pred_dc)(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int log2_size, int c_idx)
 
static const int8_t intra_pred_angle_low[16]
 
static void hevc_intra_pred_horiz_8x8_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t flag)
 
static void hevc_intra_pred_angular_lower_16width_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t mode)
 
static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst, int32_t dst_stride)
 
#define SW4(in0, in1, in2, in3, pdst, stride)
 
static const uint8_t offset[127][2]
 
static void hevc_intra_pred_angular_lower_4width_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t mode)
 
static void hevc_intra_pred_horiz_16x16_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t flag)
 
static void hevc_intra_pred_plane_16x16_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride)
 
static void hevc_intra_pred_plane_4x4_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride)
 
static void hevc_intra_pred_angular_lower_32width_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t mode)
 
static void hevc_intra_pred_angular_upper_8width_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t mode)
 
#define ST2x4_UB(in, stidx, pdst, stride)
 
static void hevc_intra_pred_horiz_32x32_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride)
 
void ff_hevc_intra_pred_planar_1_msa(uint8_t *dst, const uint8_t *src_top, const uint8_t *src_left, ptrdiff_t stride)
 
static void hevc_intra_pred_angular_upper_32width_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t mode)
 
void (*pred_angular[4])(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int c_idx, int mode)
 
static void hevc_intra_pred_vert_16x16_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t flag)
 
void ff_intra_pred_8_32x32_msa(HEVCContext *s, int x0, int y0, int c_idx)
 
static void hevc_intra_pred_plane_8x8_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride)
 
static void hevc_intra_pred_dc_4x4_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t flag)
 
#define ADD2(in0, in1, in2, in3, out0, out1)
 
#define INSERT_W2_SB(...)
 
static void hevc_intra_pred_vert_4x4_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t flag)
 
#define SD4(in0, in1, in2, in3, pdst, stride)
 
static void hevc_intra_pred_dc_32x32_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride)
 
void ff_hevc_intra_pred_planar_3_msa(uint8_t *dst, const uint8_t *src_top, const uint8_t *src_left, ptrdiff_t stride)
 
void ff_intra_pred_8_16x16_msa(HEVCContext *s, int x0, int y0, int c_idx)
 
static void process_intra_lower_16x16_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, uint8_t offset)
 
void ff_pred_intra_pred_angular_1_msa(uint8_t *dst, const uint8_t *src_top, const uint8_t *src_left, ptrdiff_t stride, int c_idx, int mode)
 
static void hevc_intra_pred_angular_upper_16width_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t mode)
 
#define ST4x8_UB(in0, in1, pdst, stride)
 
#define INSERT_D2_UB(...)
 
void ff_pred_intra_pred_angular_2_msa(uint8_t *dst, const uint8_t *src_top, const uint8_t *src_left, ptrdiff_t stride, int c_idx, int mode)
 
#define SUB2(in0, in1, in2, in3, out0, out1)
 
static void hevc_intra_pred_angular_lower_8width_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t mode)
 
#define ST8x4_UB(in0, in1, pdst, stride)
 
void ff_hevc_intra_pred_planar_0_msa(uint8_t *dst, const uint8_t *src_top, const uint8_t *src_left, ptrdiff_t stride)
 
#define ST8x8_UB(in0, in1, in2, in3, pdst, stride)
 
void ff_pred_intra_pred_angular_3_msa(uint8_t *dst, const uint8_t *src_top, const uint8_t *src_left, ptrdiff_t stride, int c_idx, int mode)
 
void ff_pred_intra_pred_angular_0_msa(uint8_t *dst, const uint8_t *src_top, const uint8_t *src_left, ptrdiff_t stride, int c_idx, int mode)
 
static void hevc_intra_pred_dc_16x16_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t flag)
 
void (*pred_planar[4])(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride)
 
static void process_intra_upper_16x16_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, uint8_t offset)
 
#define ST4x2_UB(in, pdst, stride)
 
static void hevc_intra_pred_angular_upper_4width_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t mode)