26 -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32
30 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26
33 #define HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1, \
34 mul_val_h0, mul_val_h1, mul_val_h2, mul_val_h3, \
35 res0, res1, mul_val_b0, mul_val_b1, round) \
37 v8i16 res0_m, res1_m, res2_m, res3_m; \
39 MUL4(mul_val_h0, vec0, mul_val_h2, vec0, mul_val_h0, vec1, \
40 mul_val_h2, vec1, res0_m, res1_m, res2_m, res3_m); \
42 res0_m += mul_val_h1 * tmp0; \
43 res1_m += mul_val_h3 * tmp0; \
44 res2_m += mul_val_h1 * tmp0; \
45 res3_m += mul_val_h3 * tmp0; \
47 res0_m += mul_val_b0 * src0_r; \
48 res1_m += mul_val_b0 * src0_l; \
49 res2_m += (mul_val_b0 - 1) * src0_r; \
50 res3_m += (mul_val_b0 - 1) * src0_l; \
52 res0_m += mul_val_b1 * tmp1; \
53 res1_m += mul_val_b1 * tmp1; \
54 res2_m += (mul_val_b1 + 1) * tmp1; \
55 res3_m += (mul_val_b1 + 1) * tmp1; \
57 SRARI_H4_SH(res0_m, res1_m, res2_m, res3_m, round); \
58 PCKEV_B2_SH(res1_m, res0_m, res3_m, res2_m, res0, res1); \
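/*
 * Editor's note (not part of the original file): HEVC_PRED_PLANAR_16x2
 * vectorizes two rows of the HEVC planar predictor.  A minimal scalar
 * sketch of the same formula, assuming 8-bit samples and <stdint.h>:
 *
 *   pred[y][x] = ((size-1-x)*left[y] + (x+1)*top[size] +
 *                 (size-1-y)*top[x]  + (y+1)*left[size] + size) >> round
 *
 * with round = log2(size) + 1 (the macro's "round" argument).
 */
static void planar_ref_row(uint8_t *row, const uint8_t *top,
                           const uint8_t *left, int y, int size, int round)
{
    int x;

    for (x = 0; x < size; x++)
        row[x] = ((size - 1 - x) * left[y] + (x + 1) * top[size] +
                  (size - 1 - y) * top[x] + (y + 1) * left[size] +
                  size) >> round;
}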
68 v8i16 vec0, vec1, vec2;
71 src_data = LW(src_top);
72 SW4(src_data, src_data, src_data, src_data, dst, stride);
75 src_data = LW(src_left);
77 vec2 = (v8i16) __msa_insert_w((v4i32) vec2, 0, src_data);
79 vec0 = __msa_fill_h(src_left[-1]);
80 vec1 = __msa_fill_h(src_top[0]);
82 vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
88 for (col = 0; col < 4; col++) {
89 dst[stride * col] = (uint8_t) vec2[col];
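/*
 * Editor's sketch (not part of the original file): scalar form of the
 * flag-guarded path above.  For luma blocks smaller than 32x32, vertical
 * prediction filters the first column towards the left neighbours and
 * clips to 8 bits (CLIP_SH_0_255 in the MSA code).
 */
static void vert_left_filter_ref(uint8_t *dst, int32_t stride,
                                 const uint8_t *src_top,
                                 const uint8_t *src_left, int size)
{
    int y, val;

    for (y = 0; y < size; y++) {
        val = src_top[0] + ((src_left[y] - src_left[-1]) >> 1);
        dst[y * stride] = val < 0 ? 0 : (val > 255 ? 255 : val);
    }
}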
101 uint16_t val0, val1, val2, val3;
103 v8i16 vec0, vec1, vec2;
106 src_data1 = LD(src_top);
108 for (row = 8; row--;) {
109 SD(src_data1, tmp_dst);
114 src_data1 = LD(src_left);
116 vec2 = (v8i16) __msa_insert_d((v2i64) zero, 0, src_data1);
118 vec0 = __msa_fill_h(src_left[-1]);
119 vec1 = __msa_fill_h(src_top[0]);
121 vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
158 v8i16 vec0, vec1, vec2, vec3;
160 src = LD_UB(src_top);
162 for (row = 16; row--;) {
168 src = LD_UB(src_left);
170 vec0 = __msa_fill_h(src_left[-1]);
171 vec1 = __msa_fill_h(src_top[0]);
174 SUB2(vec2, vec0, vec3, vec0, vec2, vec3);
179 ADD2(vec2, vec1, vec3, vec1, vec2, vec3);
182 src = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);
184 for (col = 0; col < 16; col++) {
185 dst[stride * col] = src[col];
195 uint32_t val0, val1, val2, val3;
197 v8i16 src0_r, src_top_val, src_left_val;
200 val0 = src_left[0] * 0x01010101;
201 val1 = src_left[1] * 0x01010101;
202 val2 = src_left[2] * 0x01010101;
203 val3 = src_left[3] * 0x01010101;
204 SW4(val0, val1, val2, val3, dst, stride);
208 src0 = (v16i8) __msa_insert_w((v4i32) src0, 0, val0);
209 src_top_val = __msa_fill_h(src_top[-1]);
210 src_left_val = __msa_fill_h(src_left[0]);
212 src0_r = (v8i16) __msa_ilvr_b(zero, src0);
214 src0_r -= src_top_val;
216 src0_r += src_left_val;
218 src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
219 val0 = __msa_copy_s_w((v4i32) src0, 0);
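/*
 * Editor's sketch (not part of the original file): scalar equivalent of
 * horizontal prediction with the flag-guarded top-row filter used above
 * for small luma blocks.  Assumes <string.h> for memset.
 */
static void horiz_top_filter_ref(uint8_t *dst, int32_t stride,
                                 const uint8_t *src_top,
                                 const uint8_t *src_left, int size)
{
    int x, y, val;

    for (y = 0; y < size; y++)
        memset(dst + y * stride, src_left[y], size);

    for (x = 0; x < size; x++) {
        val = src_left[0] + ((src_top[x] - src_top[-1]) >> 1);
        dst[x] = val < 0 ? 0 : (val > 255 ? 255 : val);
    }
}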
229 uint64_t val0, val1, val2, val3;
231 v8i16 src0_r, src_top_val, src_left_val;
234 val0 = src_left[0] * 0x0101010101010101;
235 val1 = src_left[1] * 0x0101010101010101;
236 val2 = src_left[2] * 0x0101010101010101;
237 val3 = src_left[3] * 0x0101010101010101;
238 SD4(val0, val1, val2, val3, dst, stride);
240 val0 = src_left[4] * 0x0101010101010101;
241 val1 = src_left[5] * 0x0101010101010101;
242 val2 = src_left[6] * 0x0101010101010101;
243 val3 = src_left[7] * 0x0101010101010101;
244 SD4(val0, val1, val2, val3, dst + 4 * stride, stride);
248 src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, val0);
249 src_top_val = __msa_fill_h(src_top[-1]);
250 src_left_val = __msa_fill_h(src_left[0]);
252 src0_r = (v8i16) __msa_ilvr_b(zero, src0);
254 src0_r -= src_top_val;
256 src0_r += src_left_val;
258 src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
259 val0 = __msa_copy_s_d((v2i64) src0, 0);
271 uint8_t inp0, inp1, inp2, inp3;
273 v8i16 src0_r, src0_l, src_left_val, src_top_val;
275 src_left_val = __msa_fill_h(src_left[0]);
277 for (row = 4; row--;) {
284 src0 = __msa_fill_b(inp0);
285 src1 = __msa_fill_b(inp1);
286 src2 = __msa_fill_b(inp2);
287 src3 = __msa_fill_b(inp3);
289 ST_SB4(src0, src1, src2, src3, tmp_dst, stride);
294 src0 = LD_SB(src_top);
295 src_top_val = __msa_fill_h(src_top[-1]);
298 SUB2(src0_r, src_top_val, src0_l, src_top_val, src0_r, src0_l);
303 ADD2(src0_r, src_left_val, src0_l, src_left_val, src0_r, src0_l);
305 src0 = __msa_pckev_b((v16i8) src0_l, (v16i8) src0_r);
315 uint8_t inp0, inp1, inp2, inp3;
318 for (row = 0; row < 8; row++) {
319 inp0 = src_left[row * 4];
320 inp1 = src_left[row * 4 + 1];
321 inp2 = src_left[row * 4 + 2];
322 inp3 = src_left[row * 4 + 3];
324 src0 = __msa_fill_b(inp0);
325 src1 = __msa_fill_b(inp1);
326 src2 = __msa_fill_b(inp2);
327 src3 = __msa_fill_b(inp3);
329 ST_SB2(src0, src0, dst, 16);
331 ST_SB2(src1, src1, dst, 16);
333 ST_SB2(src2, src2, dst, 16);
335 ST_SB2(src3, src3, dst, 16);
346 uint32_t addition = 0;
347 uint32_t val0, val1, val2;
351 v8u16 sum, vec0, vec1;
356 sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
357 sum = (v8u16) __msa_hadd_u_w(sum, sum);
358 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
359 sum = (v8u16) __msa_srari_w((v4i32) sum, 3);
360 addition = __msa_copy_u_w((v4i32) sum, 0);
361 store = (v16u8) __msa_fill_b(addition);
362 val0 = __msa_copy_u_w((v4i32) store, 0);
363 SW4(val0, val0, val0, val0, dst, stride)
366 ILVR_B2_UH(zero, store, zero, src, vec0, vec1);
372 vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
373 store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
374 val1 = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
375 store = (v16u8) __msa_insert_b((v16i8) store, 0, val1);
376 val0 = __msa_copy_u_w((v4i32) store, 0);
385 ADD2(val0, addition, val1, addition, val0, val1);
395 tmp_dst[stride * 1] = val0;
396 tmp_dst[stride * 2] = val1;
397 tmp_dst[stride * 3] = val2;
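/*
 * Editor's sketch (not part of the original file): scalar DC prediction
 * matching the vector code above.  The boundary filter is applied only for
 * luma blocks smaller than 32x32, which is what the "flag" argument of the
 * MSA functions selects.
 */
static void dc_ref(uint8_t *dst, int32_t stride, const uint8_t *src_top,
                   const uint8_t *src_left, int log2_size, int filter)
{
    int size = 1 << log2_size;
    int dc = size, x, y;

    for (x = 0; x < size; x++)
        dc += src_top[x] + src_left[x];
    dc >>= log2_size + 1;

    for (y = 0; y < size; y++)
        for (x = 0; x < size; x++)
            dst[y * stride + x] = dc;

    if (filter) {
        dst[0] = (src_left[0] + 2 * dc + src_top[0] + 2) >> 2;
        for (x = 1; x < size; x++)
            dst[x] = (src_top[x] + 3 * dc + 2) >> 2;
        for (y = 1; y < size; y++)
            dst[y * stride] = (src_left[y] + 3 * dc + 2) >> 2;
    }
}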
407 uint32_t row, col, val;
408 uint32_t addition = 0;
412 v8u16 sum, vec0, vec1;
418 sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
419 sum = (v8u16) __msa_hadd_u_w(sum, sum);
420 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
421 sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
422 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
423 sum = (v8u16) __msa_srari_w((v4i32) sum, 4);
424 addition = __msa_copy_u_w((v4i32) sum, 0);
425 store = (v16u8) __msa_fill_b(addition);
426 val0 = __msa_copy_u_d((v2i64) store, 0);
428 for (row = 8; row--;) {
434 ILVR_B2_UH(zero, store, zero, src, vec0, vec1);
439 vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
440 store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
441 val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
442 store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
443 val0 = __msa_copy_u_d((v2i64) store, 0);
447 src = (v16u8) __msa_insert_d((v2i64) src, 0, val0);
448 vec1 = (v8u16) __msa_ilvr_b(zero, (v16i8) src);
449 vec0 = (v8u16) __msa_fill_h(addition);
452 vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
454 for (col = 1; col < 8; col++) {
455 tmp_dst[stride * col] = vec1[col];
466 uint32_t row, col, val;
467 uint32_t addition = 0;
468 v16u8 src_above1, store, src_left1;
469 v8u16 sum, sum_above, sum_left;
470 v8u16 vec0, vec1, vec2;
473 src_above1 = LD_UB(src_top);
474 src_left1 = LD_UB(src_left);
476 HADD_UB2_UH(src_above1, src_left1, sum_above, sum_left);
477 sum = sum_above + sum_left;
478 sum = (v8u16) __msa_hadd_u_w(sum, sum);
479 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
480 sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
481 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
482 sum = (v8u16) __msa_srari_w((v4i32) sum, 5);
483 addition = __msa_copy_u_w((v4i32) sum, 0);
484 store = (v16u8) __msa_fill_b(addition);
486 for (row = 16; row--;) {
492 vec0 = (v8u16) __msa_ilvr_b(zero, (v16i8) store);
494 ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
496 ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
498 store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);
499 val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
500 store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
501 ST_UB(store, tmp_dst);
504 vec0 = (v8u16) __msa_fill_h(addition);
506 ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
508 store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);
510 for (col = 1; col < 16; col++) {
511 tmp_dst[stride * col] = store[col];
521 v16u8 src_above1, src_above2, store, src_left1, src_left2;
522 v8u16 sum_above1, sum_above2;
523 v8u16 sum_left1, sum_left2;
524 v8u16 sum, sum_above, sum_left;
526 LD_UB2(src_top, 16, src_above1, src_above2);
527 LD_UB2(src_left, 16, src_left1, src_left2);
528 HADD_UB2_UH(src_above1, src_above2, sum_above1, sum_above2);
529 HADD_UB2_UH(src_left1, src_left2, sum_left1, sum_left2);
530 sum_above = sum_above1 + sum_above2;
531 sum_left = sum_left1 + sum_left2;
532 sum = sum_above + sum_left;
533 sum = (v8u16) __msa_hadd_u_w(sum, sum);
534 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
535 sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
536 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
537 sum = (v8u16) __msa_srari_w((v4i32) sum, 6);
538 store = (v16u8) __msa_splati_b((v16i8) sum, 0);
540 for (row = 16; row--;) {
541 ST_UB2(store, store, dst, 16);
543 ST_UB2(store, store, dst, 16);
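/*
 * Editor's note (not part of the original file): the 4x4/8x8/16x16/32x32 DC
 * variants differ only in the rounding shift of the averaging step (3, 4, 5
 * and 6 respectively, i.e. log2(2 * size)) and in that the 32x32 variant
 * never applies the boundary filter, as HEVC requires.
 */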
553 v16i8 src_vec0, src_vec1;
554 v8i16 src_vec0_r, src1_r, tmp0, tmp1, mul_val1;
555 v8i16 vec0, vec1, vec2, vec3, res0, res1, res2, res3;
556 v8i16 mul_val0 = { 3, 2, 1, 0, 1, 2, 3, 4 };
562 mul_val1 = (v8i16) __msa_pckod_d((v2i64) mul_val0, (v2i64) mul_val0);
564 src_vec0 = (v16i8) __msa_insert_w((v4i32) zero, 0, src0);
565 src_vec1 = (v16i8) __msa_insert_w((v4i32) zero, 0, src1);
567 ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src1_r);
568 SPLATI_H4_SH(src1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);
570 tmp0 = __msa_fill_h(src_top[4]);
571 tmp1 = __msa_fill_h(src_left[4]);
573 MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
574 res0, res1, res2, res3);
576 res0 += mul_val1 * tmp0;
577 res1 += mul_val1 * tmp0;
578 res2 += mul_val1 * tmp0;
579 res3 += mul_val1 * tmp0;
581 res0 += 3 * src_vec0_r;
582 res1 += 2 * src_vec0_r;
591 src_vec0 = __msa_pckev_b((v16i8) res1, (v16i8) res0);
592 ST4x4_UB(src_vec0, src_vec0, 0, 1, 2, 3, dst, stride);
600 v16i8 src_vec0, src_vec1, src_vec2, src_vec3;
601 v8i16 src_vec0_r, src_vec1_r;
602 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
603 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
604 v8i16 tmp0, tmp1, tmp2;
605 v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
606 v8i16 mul_val0 = { 7, 6, 5, 4, 3, 2, 1, 0 };
612 src_vec0 = (v16i8) __msa_insert_d((v2i64) zero, 0, src0);
613 src_vec1 = (v16i8) __msa_insert_d((v2i64) zero, 0, src1);
615 ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src_vec1_r);
616 SPLATI_H4_SH(src_vec1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);
617 SPLATI_H4_SH(src_vec1_r, 4, 5, 6, 7, vec4, vec5, vec6, vec7);
619 tmp0 = __msa_fill_h(src_top[8]);
620 tmp1 = __msa_fill_h(src_left[8]);
622 MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
623 res0, res1, res2, res3);
624 MUL4(mul_val0, vec4, mul_val0, vec5, mul_val0, vec6, mul_val0, vec7,
625 res4, res5, res6, res7);
627 tmp2 = mul_val1 * tmp0;
637 res0 += 7 * src_vec0_r;
638 res1 += 6 * src_vec0_r;
639 res2 += 5 * src_vec0_r;
640 res3 += 4 * src_vec0_r;
641 res4 += 3 * src_vec0_r;
642 res5 += 2 * src_vec0_r;
656 PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
657 src_vec0, src_vec1, src_vec2, src_vec3);
659 ST8x8_UB(src_vec0, src_vec1, src_vec2, src_vec3, dst, stride);
667 v8i16 src0_r, src1_r, src0_l, src1_l;
669 v8i16 res0, res1, tmp0, tmp1;
670 v8i16 mul_val2, mul_val3;
671 v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
672 v8i16 mul_val0 = { 15, 14, 13, 12, 11, 10, 9, 8 };
674 src0 = LD_UB(src_top);
675 src1 = LD_UB(src_left);
680 mul_val2 = mul_val0 - 8;
681 mul_val3 = mul_val1 + 8;
683 tmp0 = __msa_fill_h(src_top[16]);
684 tmp1 = __msa_fill_h(src_left[16]);
688 mul_val0, mul_val1, mul_val2, mul_val3,
689 res0, res1, 15, 1, 5);
690 ST_SH2(res0, res1, dst, stride);
695 mul_val0, mul_val1, mul_val2, mul_val3,
696 res0, res1, 13, 3, 5);
697 ST_SH2(res0, res1, dst, stride);
702 mul_val0, mul_val1, mul_val2, mul_val3,
703 res0, res1, 11, 5, 5);
704 ST_SH2(res0, res1, dst, stride);
709 mul_val0, mul_val1, mul_val2, mul_val3,
710 res0, res1, 9, 7, 5);
711 ST_SH2(res0, res1, dst, stride);
716 mul_val0, mul_val1, mul_val2, mul_val3,
717 res0, res1, 7, 9, 5);
718 ST_SH2(res0, res1, dst, stride);
723 mul_val0, mul_val1, mul_val2, mul_val3,
724 res0, res1, 5, 11, 5);
725 ST_SH2(res0, res1, dst, stride);
730 mul_val0, mul_val1, mul_val2, mul_val3,
731 res0, res1, 3, 13, 5);
732 ST_SH2(res0, res1, dst, stride);
737 mul_val0, mul_val1, mul_val2, mul_val3,
738 res0, res1, 1, 15, 5);
739 ST_SH2(res0, res1, dst, stride);
748 v8i16 src0_r, src1_r, src0_l, src1_l;
749 v8i16 vec0, vec1, res0, res1;
751 v8i16 mul_val2, mul_val3;
752 v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
753 v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };
755 tmp0 = __msa_fill_h(src_top[32 - offset]);
756 tmp1 = __msa_fill_h(src_left[32]);
758 src0 = LD_SB(src_top);
759 src1 = LD_SB(src_left);
766 mul_val2 = mul_val0 - 8;
767 mul_val3 = mul_val1 + 8;
771 mul_val0, mul_val1, mul_val2, mul_val3,
772 res0, res1, 31, 1, 6);
773 ST_SH2(res0, res1, dst, stride);
778 mul_val0, mul_val1, mul_val2, mul_val3,
779 res0, res1, 29, 3, 6);
780 ST_SH2(res0, res1, dst, stride);
785 mul_val0, mul_val1, mul_val2, mul_val3,
786 res0, res1, 27, 5, 6);
787 ST_SH2(res0, res1, dst, stride);
792 mul_val0, mul_val1, mul_val2, mul_val3,
793 res0, res1, 25, 7, 6);
794 ST_SH2(res0, res1, dst, stride);
799 mul_val0, mul_val1, mul_val2, mul_val3,
800 res0, res1, 23, 9, 6);
801 ST_SH2(res0, res1, dst, stride);
806 mul_val0, mul_val1, mul_val2, mul_val3,
807 res0, res1, 21, 11, 6);
808 ST_SH2(res0, res1, dst, stride);
813 mul_val0, mul_val1, mul_val2, mul_val3,
814 res0, res1, 19, 13, 6);
815 ST_SH2(res0, res1, dst, stride);
820 mul_val0, mul_val1, mul_val2, mul_val3,
821 res0, res1, 17, 15, 6);
822 ST_SH2(res0, res1, dst, stride);
831 v8i16 src0_r, src1_r, src0_l, src1_l;
832 v8i16 vec0, vec1, res0, res1, tmp0, tmp1;
833 v8i16 mul_val2, mul_val3;
834 v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
835 v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };
837 tmp0 = __msa_fill_h(src_top[32 - offset]);
838 tmp1 = __msa_fill_h(src_left[16]);
840 src0 = LD_SB(src_top);
841 src1 = LD_SB(src_left);
848 mul_val2 = mul_val0 - 8;
849 mul_val3 = mul_val1 + 8;
853 mul_val0, mul_val1, mul_val2, mul_val3,
854 res0, res1, 15, 17, 6);
855 ST_SH2(res0, res1, dst, stride);
860 mul_val0, mul_val1, mul_val2, mul_val3,
861 res0, res1, 13, 19, 6);
862 ST_SH2(res0, res1, dst, stride);
867 mul_val0, mul_val1, mul_val2, mul_val3,
868 res0, res1, 11, 21, 6);
869 ST_SH2(res0, res1, dst, stride);
874 mul_val0, mul_val1, mul_val2, mul_val3,
875 res0, res1, 9, 23, 6);
876 ST_SH2(res0, res1, dst, stride);
881 mul_val0, mul_val1, mul_val2, mul_val3,
882 res0, res1, 7, 25, 6);
883 ST_SH2(res0, res1, dst, stride);
888 mul_val0, mul_val1, mul_val2, mul_val3,
889 res0, res1, 5, 27, 6);
890 ST_SH2(res0, res1, dst, stride);
895 mul_val0, mul_val1, mul_val2, mul_val3,
896 res0, res1, 3, 29, 6);
897 ST_SH2(res0, res1, dst, stride);
902 mul_val0, mul_val1, mul_val2, mul_val3,
903 res0, res1, 1, 31, 6);
904 ST_SH2(res0, res1, dst, stride);
913 (dst + 16), stride, 16);
919 (dst + 16), stride, 16);
928 int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
930 uint8_t *ref_tmp = ref_array + 4;
933 int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
934 int32_t idx2, fact_val2, idx3, fact_val3;
938 v16i8 top0, top1, top2, top3;
941 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
942 v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
945 inv_angle_val = inv_angle[mode - 18];
950 if (angle < 0 && last < -1) {
951 inv_angle_val = inv_angle[mode - 18];
956 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
957 offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
958 ref_tmp[h_cnt] = src_left[offset];
964 idx0 = angle_loop >> 5;
965 fact_val0 = angle_loop & 31;
968 idx1 = angle_loop >> 5;
969 fact_val1 = angle_loop & 31;
972 idx2 = angle_loop >> 5;
973 fact_val2 = angle_loop & 31;
976 idx3 = angle_loop >> 5;
977 fact_val3 = angle_loop & 31;
979 top0 = LD_SB(ref + idx0 + 1);
980 top1 = LD_SB(ref + idx1 + 1);
981 top2 = LD_SB(ref + idx2 + 1);
982 top3 = LD_SB(ref + idx3 + 1);
984 fact0 = __msa_fill_h(fact_val0);
985 fact1 = __msa_fill_h(32 - fact_val0);
987 fact2 = __msa_fill_h(fact_val1);
988 fact3 = __msa_fill_h(32 - fact_val1);
990 fact4 = __msa_fill_h(fact_val2);
991 fact5 = __msa_fill_h(32 - fact_val2);
993 fact6 = __msa_fill_h(fact_val3);
994 fact7 = __msa_fill_h(32 - fact_val3);
996 ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
997 ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
998 ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
999 diff0, diff2, diff4, diff6);
1000 SLDI_B4_0_SH(diff0, diff2, diff4, diff6, diff1, diff3, diff5, diff7, 2);
1001 ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
1002 ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
1003 MUL2(diff1, fact0, diff3, fact2, diff1, diff3);
1005 diff1 += diff0 * fact1;
1006 diff3 += diff2 * fact3;
1009 dst_val0 = __msa_pckev_b((v16i8) diff3, (v16i8) diff1);
1010 ST4x4_UB(dst_val0, dst_val0, 0, 1, 2, 3, dst, stride);
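/*
 * Editor's sketch (not part of the original file): scalar form of the
 * vertical-class angular prediction (modes 18..34) that the vector code
 * above computes four rows at a time.  "ref" is the extended top reference
 * and "angle" comes from intra_pred_angle_up[mode - 18].
 */
static void angular_upper_ref(uint8_t *dst, int32_t stride,
                              const uint8_t *ref, int size, int angle)
{
    int x, y, pos = 0, idx, fact;

    for (y = 0; y < size; y++) {
        pos += angle;
        idx = pos >> 5;
        fact = pos & 31;
        for (x = 0; x < size; x++)
            dst[y * stride + x] = ((32 - fact) * ref[x + idx + 1] +
                                   fact * ref[x + idx + 2] + 16) >> 5;
    }
}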
1019 int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
1020 uint8_t ref_array[3 * 32 + 4];
1021 uint8_t *ref_tmp = ref_array + 8;
1023 const uint8_t *src_left_tmp = src_left - 1;
1025 int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1026 int32_t idx2, fact_val2, idx3, fact_val3;
1028 int32_t inv_angle_val, inv_angle_val_loop;
1030 v16i8 top0, top1, top2, top3;
1031 v16u8 dst_val0, dst_val1;
1032 v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1033 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1036 inv_angle_val = inv_angle[mode - 18];
1037 last = (angle) >> 2;
1042 inv_angle_val_loop = inv_angle_val * last;
1048 SW(tmp1, ref_tmp + 4);
1049 SW(tmp2, ref_tmp + 8);
1051 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1052 offset = (inv_angle_val_loop + 128) >> 8;
1053 ref_tmp[h_cnt] = src_left_tmp[offset];
1054 inv_angle_val_loop += inv_angle_val;
1059 for (v_cnt = 0; v_cnt < 2; v_cnt++) {
1060 idx0 = (angle_loop) >> 5;
1061 fact_val0 = (angle_loop) & 31;
1062 angle_loop += angle;
1064 idx1 = (angle_loop) >> 5;
1065 fact_val1 = (angle_loop) & 31;
1066 angle_loop += angle;
1068 idx2 = (angle_loop) >> 5;
1069 fact_val2 = (angle_loop) & 31;
1070 angle_loop += angle;
1072 idx3 = (angle_loop) >> 5;
1073 fact_val3 = (angle_loop) & 31;
1074 angle_loop += angle;
1076 top0 = LD_SB(ref + idx0 + 1);
1077 top1 = LD_SB(ref + idx1 + 1);
1078 top2 = LD_SB(ref + idx2 + 1);
1079 top3 = LD_SB(ref + idx3 + 1);
1081 fact0 = __msa_fill_h(fact_val0);
1082 fact1 = __msa_fill_h(32 - fact_val0);
1083 fact2 = __msa_fill_h(fact_val1);
1084 fact3 = __msa_fill_h(32 - fact_val1);
1085 fact4 = __msa_fill_h(fact_val2);
1086 fact5 = __msa_fill_h(32 - fact_val2);
1087 fact6 = __msa_fill_h(fact_val3);
1088 fact7 = __msa_fill_h(32 - fact_val3);
1095 SLDI_B2_SH(diff1, diff3, diff0, diff2, diff1, diff3, 2);
1096 SLDI_B2_SH(diff5, diff7, diff4, diff6, diff5, diff7, 2);
1097 MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
1098 diff1, diff3, diff5, diff7);
1100 diff1 += diff0 * fact1;
1101 diff3 += diff2 * fact3;
1102 diff5 += diff4 * fact5;
1103 diff7 += diff6 * fact7;
1106 PCKEV_B2_UB(diff3, diff1, diff7, diff5, dst_val0, dst_val1);
1107 ST8x4_UB(dst_val0, dst_val1, dst, stride);
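/*
 * Editor's sketch (not part of the original file): the scalar projection
 * used by the loops above to extend the reference to the left of ref[0]
 * when the prediction angle is negative.  "src_left_m1" points one sample
 * before the left column (src_left - 1) and "inv_angle" is the inverse
 * angle table entry for the mode.
 */
static void extend_ref_left(uint8_t *ref_tmp, const uint8_t *src_left_m1,
                            int last, int inv_angle)
{
    int h, pos = inv_angle * last;

    for (h = last; h <= -1; h++) {
        ref_tmp[h] = src_left_m1[(pos + 128) >> 8];
        pos += inv_angle;
    }
}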
1118 int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
1119 int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1120 int32_t idx2, fact_val2, idx3, fact_val3;
1123 int32_t inv_angle_val, inv_angle_val_loop;
1124 uint8_t ref_array[3 * 32 + 4];
1125 uint8_t *ref_tmp = ref_array + 16;
1127 const uint8_t *src_left_tmp = src_left - 1;
1129 v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
1130 v16i8 dst0, dst1, dst2, dst3;
1131 v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1132 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1133 v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1136 inv_angle_val = inv_angle[mode - 18];
1142 inv_angle_val_loop = inv_angle_val * last;
1145 tmp0 = LW(ref + 16);
1146 ST_UB(top0, ref_tmp);
1147 SW(tmp0, ref_tmp + 16);
1149 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1150 offset = (inv_angle_val_loop + 128) >> 8;
1151 ref_tmp[h_cnt] = src_left_tmp[offset];
1152 inv_angle_val_loop += inv_angle_val;
1157 for (v_cnt = 4; v_cnt--;) {
1158 idx0 = (angle_loop) >> 5;
1159 fact_val0 = (angle_loop) & 31;
1160 angle_loop += angle;
1162 idx1 = (angle_loop) >> 5;
1163 fact_val1 = (angle_loop) & 31;
1164 angle_loop += angle;
1166 idx2 = (angle_loop) >> 5;
1167 fact_val2 = (angle_loop) & 31;
1168 angle_loop += angle;
1170 idx3 = (angle_loop) >> 5;
1171 fact_val3 = (angle_loop) & 31;
1172 angle_loop += angle;
1174 LD_UB2(ref + idx0 + 1, 16, top0, top1);
1175 LD_UB2(ref + idx1 + 1, 16, top2, top3);
1176 LD_UB2(ref + idx2 + 1, 16, top4, top5);
1177 LD_UB2(ref + idx3 + 1, 16, top6, top7);
1179 fact0 = __msa_fill_h(fact_val0);
1180 fact1 = __msa_fill_h(32 - fact_val0);
1181 fact2 = __msa_fill_h(fact_val1);
1182 fact3 = __msa_fill_h(32 - fact_val1);
1183 fact4 = __msa_fill_h(fact_val2);
1184 fact5 = __msa_fill_h(32 - fact_val2);
1185 fact6 = __msa_fill_h(fact_val3);
1186 fact7 = __msa_fill_h(32 - fact_val3);
1188 SLDI_B2_UB(top1, top3, top0, top2, top1, top3, 1);
1189 SLDI_B2_UB(top5, top7, top4, top6, top5, top7, 1);
1199 MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
1200 diff2, diff3, diff6, diff7);
1201 MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
1202 diff10, diff11, diff14, diff15);
1204 diff2 += diff0 * fact1;
1205 diff3 += diff1 * fact1;
1206 diff6 += diff4 * fact3;
1207 diff7 += diff5 * fact3;
1208 diff10 += diff8 * fact5;
1209 diff11 += diff9 * fact5;
1210 diff14 += diff12 * fact7;
1211 diff15 += diff13 * fact7;
1215 PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1216 dst0, dst1, dst2, dst3);
1217 ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
1228 int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
1229 uint8_t ref_array[3 * 32 + 4];
1232 const uint8_t *src_left_tmp = src_left - 1;
1233 int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1234 int32_t tmp0, tmp1, tmp2, tmp3;
1236 int32_t inv_angle_val, inv_angle_val_loop;
1238 v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
1239 v16i8 dst0, dst1, dst2, dst3;
1240 v8i16 fact0, fact1, fact2, fact3;
1241 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1242 v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1244 ref_tmp = ref_array + 32;
1247 inv_angle_val = inv_angle[mode - 18];
1253 inv_angle_val_loop = inv_angle_val * last;
1254 LD_UB2(ref, 16, top0, top1);
1260 ST_UB2(top0, top1, ref_tmp, 16);
1266 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1267 offset = (inv_angle_val_loop + 128) >> 8;
1268 ref_tmp[h_cnt] = src_left_tmp[offset];
1269 inv_angle_val_loop += inv_angle_val;
1275 for (v_cnt = 16; v_cnt--;) {
1276 idx0 = (angle_loop) >> 5;
1277 fact_val0 = (angle_loop) & 31;
1278 angle_loop += angle;
1280 idx1 = (angle_loop) >> 5;
1281 fact_val1 = (angle_loop) & 31;
1282 angle_loop += angle;
1284 top0 = LD_UB(ref + idx0 + 1);
1285 top4 = LD_UB(ref + idx1 + 1);
1286 top1 = LD_UB(ref + idx0 + 17);
1287 top5 = LD_UB(ref + idx1 + 17);
1288 top3 = LD_UB(ref + idx0 + 33);
1289 top7 = LD_UB(ref + idx1 + 33);
1291 fact0 = __msa_fill_h(fact_val0);
1292 fact1 = __msa_fill_h(32 - fact_val0);
1293 fact2 = __msa_fill_h(fact_val1);
1294 fact3 = __msa_fill_h(32 - fact_val1);
1299 SLDI_B2_UB(top1, top3, top0, top2, top1, top3, 1);
1300 SLDI_B2_UB(top5, top7, top4, top6, top5, top7, 1);
1310 MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
1311 diff2, diff3, diff6, diff7);
1312 MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
1313 diff10, diff11, diff14, diff15);
1315 diff2 += diff0 * fact1;
1316 diff3 += diff1 * fact1;
1317 diff6 += diff4 * fact1;
1318 diff7 += diff5 * fact1;
1319 diff10 += diff8 * fact3;
1320 diff11 += diff9 * fact3;
1321 diff14 += diff12 * fact3;
1322 diff15 += diff13 * fact3;
1326 PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1327 dst0, dst1, dst2, dst3);
1329 ST_SB2(dst0, dst1, dst, 16);
1331 ST_SB2(dst2, dst3, dst, 16);
1342 int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1343 uint8_t ref_array[3 * 32 + 4];
1344 uint8_t *ref_tmp = ref_array + 4;
1347 int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
1348 int32_t idx2, fact_val2, idx3, fact_val3;
1349 int32_t angle, angle_loop, inv_angle_val;
1351 v16i8 dst_val0, dst_val1;
1352 v16u8 top0, top1, top2, top3;
1354 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1355 v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1363 inv_angle_val = inv_angle[mode - 11];
1368 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1369 offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
1370 ref_tmp[h_cnt] = src_top[offset];
1376 idx0 = angle_loop >> 5;
1377 fact_val0 = angle_loop & 31;
1378 angle_loop += angle;
1380 idx1 = angle_loop >> 5;
1381 fact_val1 = angle_loop & 31;
1382 angle_loop += angle;
1384 idx2 = angle_loop >> 5;
1385 fact_val2 = angle_loop & 31;
1386 angle_loop += angle;
1388 idx3 = angle_loop >> 5;
1389 fact_val3 = angle_loop & 31;
1391 top0 = LD_UB(ref + idx0 + 1);
1392 top1 = LD_UB(ref + idx1 + 1);
1393 top2 = LD_UB(ref + idx2 + 1);
1394 top3 = LD_UB(ref + idx3 + 1);
1396 fact0 = __msa_fill_h(fact_val0);
1397 fact1 = __msa_fill_h(32 - fact_val0);
1398 fact2 = __msa_fill_h(fact_val1);
1399 fact3 = __msa_fill_h(32 - fact_val1);
1400 fact4 = __msa_fill_h(fact_val2);
1401 fact5 = __msa_fill_h(32 - fact_val2);
1402 fact6 = __msa_fill_h(fact_val3);
1403 fact7 = __msa_fill_h(32 - fact_val3);
1405 ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
1406 ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
1407 ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
1408 diff0, diff2, diff4, diff6);
1409 SLDI_B4_0_SH(diff0, diff2, diff4, diff6, diff1, diff3, diff5, diff7, 2);
1410 ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
1411 ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
1412 MUL2(diff1, fact0, diff3, fact2, diff1, diff3);
1414 diff1 += diff0 * fact1;
1415 diff3 += diff2 * fact3;
1418 PCKEV_B2_SB(diff1, diff1, diff3, diff3, dst_val0, dst_val1);
1420 diff0 = (v8i16) __msa_pckev_b(dst_val1, dst_val0);
1421 diff1 = (v8i16) __msa_pckod_b(dst_val1, dst_val0);
1423 diff2 = (v8i16) __msa_pckev_w((v4i32) diff1, (v4i32) diff0);
1425 dst_val0 = __msa_pckev_b((v16i8) diff2, (v16i8) diff2);
1426 dst_val1 = __msa_pckod_b((v16i8) diff2, (v16i8) diff2);
1439 int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1440 uint8_t ref_array[3 * 32 + 4];
1441 uint8_t *ref_tmp = ref_array + 8;
1443 const uint8_t *src_top_tmp = src_top - 1;
1446 int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1447 int32_t idx2, fact_val2, idx3, fact_val3;
1448 int32_t angle, angle_loop, inv_angle_val;
1449 v16i8 top0, top1, top2, top3;
1450 v16i8 dst_val0, dst_val1, dst_val2, dst_val3;
1451 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1452 v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1455 last = (angle) >> 2;
1460 inv_angle_val = inv_angle[mode - 11];
1466 SW(tmp1, ref_tmp + 4);
1467 SW(tmp2, ref_tmp + 8);
1469 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1470 offset = (h_cnt * inv_angle_val + 128) >> 8;
1471 ref_tmp[h_cnt] = src_top_tmp[offset];
1477 for (v_cnt = 0; v_cnt < 2; v_cnt++) {
1480 idx0 = angle_loop >> 5;
1481 fact_val0 = angle_loop & 31;
1482 angle_loop += angle;
1484 idx1 = angle_loop >> 5;
1485 fact_val1 = angle_loop & 31;
1486 angle_loop += angle;
1488 idx2 = angle_loop >> 5;
1489 fact_val2 = angle_loop & 31;
1490 angle_loop += angle;
1492 idx3 = angle_loop >> 5;
1493 fact_val3 = angle_loop & 31;
1494 angle_loop += angle;
1496 top0 = LD_SB(ref + idx0 + 1);
1497 top1 = LD_SB(ref + idx1 + 1);
1498 top2 = LD_SB(ref + idx2 + 1);
1499 top3 = LD_SB(ref + idx3 + 1);
1501 fact0 = __msa_fill_h(fact_val0);
1502 fact1 = __msa_fill_h(32 - fact_val0);
1503 fact2 = __msa_fill_h(fact_val1);
1504 fact3 = __msa_fill_h(32 - fact_val1);
1505 fact4 = __msa_fill_h(fact_val2);
1506 fact5 = __msa_fill_h(32 - fact_val2);
1507 fact6 = __msa_fill_h(fact_val3);
1508 fact7 = __msa_fill_h(32 - fact_val3);
1514 SLDI_B2_SH(diff1, diff3, diff0, diff2, diff1, diff3, 2);
1515 SLDI_B2_SH(diff5, diff7, diff4, diff6, diff5, diff7, 2);
1516 MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
1517 diff1, diff3, diff5, diff7);
1519 diff1 += diff0 * fact1;
1520 diff3 += diff2 * fact3;
1521 diff5 += diff4 * fact5;
1522 diff7 += diff6 * fact7;
1525 PCKEV_B4_SB(diff1, diff1, diff3, diff3, diff5, diff5, diff7, diff7,
1526 dst_val0, dst_val1, dst_val2, dst_val3);
1527 ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
1529 ST4x8_UB(diff3, diff4, dst_org, stride);
1540 int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1541 int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1542 int32_t idx2, fact_val2, idx3, fact_val3, tmp0;
1543 v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
1544 v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
1545 v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1546 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1547 v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1549 uint8_t ref_array[3 * 32 + 4];
1550 uint8_t *ref_tmp = ref_array + 16;
1551 const uint8_t *ref, *src_top_tmp = src_top - 1;
1556 last = (angle) >> 1;
1561 inv_angle_val = inv_angle[mode - 11];
1564 tmp0 = LW(ref + 16);
1565 ST_SB(top0, ref_tmp);
1566 SW(tmp0, ref_tmp + 16);
1568 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1569 offset = (h_cnt * inv_angle_val + 128) >> 8;
1570 ref_tmp[h_cnt] = src_top_tmp[offset];
1576 for (v_cnt = 0; v_cnt < 4; v_cnt++) {
1579 idx0 = angle_loop >> 5;
1580 fact_val0 = angle_loop & 31;
1581 angle_loop += angle;
1583 idx1 = angle_loop >> 5;
1584 fact_val1 = angle_loop & 31;
1585 angle_loop += angle;
1587 idx2 = angle_loop >> 5;
1588 fact_val2 = angle_loop & 31;
1589 angle_loop += angle;
1591 idx3 = angle_loop >> 5;
1592 fact_val3 = angle_loop & 31;
1593 angle_loop += angle;
1595 LD_SB2(ref + idx0 + 1, 16, top0, top1);
1596 LD_SB2(ref + idx1 + 1, 16, top2, top3);
1597 LD_SB2(ref + idx2 + 1, 16, top4, top5);
1598 LD_SB2(ref + idx3 + 1, 16, top6, top7);
1600 fact0 = __msa_fill_h(fact_val0);
1601 fact1 = __msa_fill_h(32 - fact_val0);
1602 fact2 = __msa_fill_h(fact_val1);
1603 fact3 = __msa_fill_h(32 - fact_val1);
1604 fact4 = __msa_fill_h(fact_val2);
1605 fact5 = __msa_fill_h(32 - fact_val2);
1606 fact6 = __msa_fill_h(fact_val3);
1607 fact7 = __msa_fill_h(32 - fact_val3);
1609 SLDI_B2_SB(top1, top3, top0, top2, top1, top3, 1);
1610 SLDI_B2_SB(top5, top7, top4, top6, top5, top7, 1);
1621 MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
1622 diff2, diff3, diff6, diff7);
1623 MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
1624 diff10, diff11, diff14, diff15);
1626 diff2 += diff0 * fact1;
1627 diff3 += diff1 * fact1;
1628 diff6 += diff4 * fact3;
1629 diff7 += diff5 * fact3;
1630 diff10 += diff8 * fact5;
1631 diff11 += diff9 * fact5;
1632 diff14 += diff12 * fact7;
1633 diff15 += diff13 * fact7;
1637 PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1638 dst_val0, dst_val1, dst_val2, dst_val3);
1639 ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
1640 ILVL_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff2, diff3);
1643 ST4x8_UB(diff4, diff5, dst_org, stride);
1645 ST4x8_UB(diff6, diff7, dst_org, stride);
1656 int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1657 int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1, tmp0;
1658 v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
1659 v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
1660 v8i16 fact0, fact1, fact2, fact3;
1661 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1662 v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1664 uint8_t ref_array[3 * 32 + 4];
1665 uint8_t *ref_tmp = ref_array + 32;
1666 const uint8_t *ref, *src_top_tmp = src_top - 1;
1676 inv_angle_val = inv_angle[mode - 11];
1678 LD_SB2(ref, 16, top0, top1);
1679 tmp0 = LW(ref + 32);
1680 ST_SB2(top0, top1, ref_tmp, 16);
1681 SW(tmp0, ref_tmp + 32);
1683 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1684 offset = (h_cnt * inv_angle_val + 128) >> 8;
1685 ref_tmp[h_cnt] = src_top_tmp[offset];
1691 for (v_cnt = 0; v_cnt < 16; v_cnt++) {
1693 idx0 = angle_loop >> 5;
1694 fact_val0 = angle_loop & 31;
1695 angle_loop += angle;
1697 idx1 = angle_loop >> 5;
1698 fact_val1 = angle_loop & 31;
1699 angle_loop += angle;
1701 top0 = LD_SB(ref + idx0 + 1);
1702 top4 = LD_SB(ref + idx1 + 1);
1703 top1 = LD_SB(ref + idx0 + 17);
1704 top5 = LD_SB(ref + idx1 + 17);
1705 top3 = LD_SB(ref + idx0 + 33);
1706 top7 = LD_SB(ref + idx1 + 33);
1708 fact0 = __msa_fill_h(fact_val0);
1709 fact1 = __msa_fill_h(32 - fact_val0);
1710 fact2 = __msa_fill_h(fact_val1);
1711 fact3 = __msa_fill_h(32 - fact_val1);
1716 SLDI_B2_SB(top1, top3, top0, top2, top1, top3, 1);
1717 SLDI_B2_SB(top5, top7, top4, top6, top5, top7, 1);
1728 MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
1729 diff2, diff3, diff6, diff7);
1730 MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
1731 diff10, diff11, diff14, diff15);
1733 diff2 += diff0 * fact1;
1734 diff3 += diff1 * fact1;
1735 diff6 += diff4 * fact1;
1736 diff7 += diff5 * fact1;
1737 diff10 += diff8 * fact3;
1738 diff11 += diff9 * fact3;
1739 diff14 += diff12 * fact3;
1740 diff15 += diff13 * fact3;
1744 PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1745 dst_val0, dst_val1, dst_val2, dst_val3);
1749 ST2x4_UB(diff0, 0, dst_org, stride);
1751 ST2x4_UB(diff0, 4, dst_org, stride);
1753 ST2x4_UB(diff1, 0, dst_org, stride);
1755 ST2x4_UB(diff1, 4, dst_org, stride);
1758 ST2x4_UB(diff2, 0, dst_org, stride);
1760 ST2x4_UB(diff2, 4, dst_org, stride);
1762 ST2x4_UB(diff3, 0, dst_org, stride);
1764 ST2x4_UB(diff3, 4, dst_org, stride);
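/*
 * Editor's sketch (not part of the original file): the horizontal-class
 * angular modes (2..17) handled by the *_lower_* functions.  Prediction
 * walks the extended left reference column by column; the ILVR/ILVL and
 * ST2x4/ST4x8 shuffles above write the result transposed into dst.
 */
static void angular_lower_ref(uint8_t *dst, int32_t stride,
                              const uint8_t *ref, int size, int angle)
{
    int x, y, pos = 0, idx, fact;

    for (x = 0; x < size; x++) {
        pos += angle;
        idx = pos >> 5;
        fact = pos & 31;
        for (y = 0; y < size; y++)
            dst[y * stride + x] = ((32 - fact) * ref[y + idx + 1] +
                                   fact * ref[y + idx + 2] + 16) >> 5;
    }
}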
1778 src2 = LD_UB(src + 16);
1780 for (row = 32; row--;) {
1781 ST_UB2(src1, src2, dst, 16);
1848 } else if (mode == 26) {
1850 } else if (mode >= 18) {
1866 } else if (mode == 26) {
1868 } else if (mode >= 18) {
1884 } else if (mode == 26) {
1886 } else if (mode >= 18) {
1902 } else if (mode == 26) {
1904 } else if (mode >= 18) {
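/*
 * Editor's note (not part of the original file): in each of the four
 * ff_pred_intra_pred_angular_*_msa wrappers the chain above dispatches on
 * the HEVC mode number: mode 10 is pure horizontal, mode 26 pure vertical,
 * modes 18..34 go to the *_angular_upper_* helpers and modes 2..17 to the
 * *_angular_lower_* helpers for the corresponding block size.
 */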
1920 int size_in_luma_h = 16 << hshift;
1922 int size_in_luma_v = 16 << vshift;
1924 int x = x0 >> hshift;
1925 int y = y0 >> vshift;
1940 uint8_t left_array[2 * 32 + 1];
1941 uint8_t filtered_left_array[2 * 32 + 1];
1942 uint8_t top_array[2 * 32 + 1];
1943 uint8_t filtered_top_array[2 * 32 + 1];
1945 uint8_t *left = left_array + 1;
1947 uint8_t *filtered_left = filtered_left_array + 1;
1948 uint8_t *filtered_top = filtered_top_array + 1;
1961 int bottom_left_size =
1962 (((y0 + 2 * size_in_luma_v) >
1964 2 * size_in_luma_v)) -
1965 (y0 + size_in_luma_v)) >> vshift;
1966 int top_right_size =
1967 (((x0 + 2 * size_in_luma_h) >
1969 (x0 + size_in_luma_h)) >> hshift;
1976 if (!size_in_luma_pu_h)
1977 size_in_luma_pu_h++;
1978 if (cand_bottom_left == 1 && on_pu_edge_x) {
1983 ((size_in_luma_pu_v) >
1986 y_bottom_pu) : (size_in_luma_pu_v));
1987 cand_bottom_left = 0;
1988 for (i = 0; i < max; i += 2)
1992 i) * min_pu_width]).pred_flag ==
1995 if (cand_left == 1 && on_pu_edge_x) {
1999 ((size_in_luma_pu_v) >
2002 y_left_pu) : (size_in_luma_pu_v));
2004 for (i = 0; i < max; i += 2)
2008 i) * min_pu_width]).pred_flag ==
2011 if (cand_up_left == 1) {
2016 (y_top_pu) * min_pu_width]).pred_flag ==
2019 if (cand_up == 1 && on_pu_edge_y) {
2023 ((size_in_luma_pu_h) >
2026 x_top_pu) : (size_in_luma_pu_h));
2028 for (i = 0; i < max; i += 2)
2032 min_pu_width]).pred_flag == PF_INTRA);
2034 if (cand_up_right == 1 && on_pu_edge_y) {
2039 ((size_in_luma_pu_h) >
2042 x_right_pu) : (size_in_luma_pu_h));
2044 for (i = 0; i < max; i += 2)
2048 min_pu_width]).pred_flag == PF_INTRA);
2051 vec0 = (v16u8) __msa_ldi_b(128);
2053 ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2055 ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2060 left[-1] = src[(-1) + stride * (-1)];
2064 vec0 = LD_UB(src - stride);
2067 if (cand_up_right) {
2068 vec0 = LD_UB(src - stride + 16);
2069 ST_UB(vec0, (top + 16));
2073 ((src[(16 + top_right_size - 1) + stride * (-1)]) *
2075 for (i = 0; i < (16 - top_right_size); i += 4)
2081 for (i = 0; i < 16; i++)
2082 left[i] = src[(-1) + stride * (i)];
2083 if (cand_bottom_left) {
2084 for (i = 16; i < 16 + bottom_left_size; i++)
2085 left[i] = src[(-1) + stride * (i)];
2088 ((src[(-1) + stride * (16 + bottom_left_size - 1)]) *
2090 for (i = 0; i < (16 - bottom_left_size); i += 4)
2091 ((((union unaligned_32 *) (left + 16 + bottom_left_size +
2097 if (cand_bottom_left || cand_left || cand_up_left || cand_up
2100 x0 + ((2 * 16) << hshift) <
2103 y0 + ((2 * 16) << vshift) <
2105 int j = 16 + (cand_bottom_left ? bottom_left_size : 0) - 1;
2106 if (!cand_up_right) {
2107 size_max_x = x0 + ((16) << hshift) < s->ps.sps->width ?
2110 if (!cand_bottom_left) {
2111 size_max_y = y0 + ((16) << vshift) < s->ps.sps->height ?
2114 if (cand_bottom_left || cand_left || cand_up_left) {
2118 ((-1) << hshift)) >> s->
ps.
sps->
2119 log2_min_pu_size)) + (((y0 +
2124 * min_pu_width]).pred_flag ==
2129 ((-1) << hshift)) >> s->
ps.
sps->
2130 log2_min_pu_size)) + (((y0 + ((j)
2135 * min_pu_width]).pred_flag == PF_INTRA)) {
2137 while (j < size_max_x
2140 ((j) << hshift)) >> s->
ps.
sps->
2141 log2_min_pu_size)) + (((y0 +
2147 * min_pu_width]).pred_flag ==
2150 for (i = j; i > (j) - (j + 1); i--)
2154 1) << hshift)) >> s->
ps.
sps->
2155 log2_min_pu_size)) + (((y0 +
2161 * min_pu_width]).pred_flag ==
2163 top[i - 1] = top[i];
2168 while (j < size_max_x
2171 ((j) << hshift)) >> s->
ps.
sps->
2172 log2_min_pu_size)) + (((y0 + ((-1)
2177 * min_pu_width]).pred_flag ==
2182 for (i = j; i > (j) - (j + 1); i--)
2193 min_pu_width]).pred_flag ==
2195 top[i - 1] = top[i];
2197 for (i = j; i > (j) - (j); i--)
2208 min_pu_width]).pred_flag ==
2210 top[i - 1] = top[i];
2216 if (cand_bottom_left || cand_left) {
2217 a = ((left[-1]) * 0x01010101U);
2218 for (i = 0; i < (0) + (size_max_y); i += 4)
2221 ((-1) << hshift)) >> s->
ps.
sps->
2222 log2_min_pu_size)) + (((y0 +
2227 * min_pu_width]).pred_flag ==
2231 a = ((left[i + 3]) * 0x01010101U);
2234 vec0 = (v16u8) __msa_fill_b(left[-1]);
2238 if (!cand_bottom_left) {
2240 vec0 = (v16u8) __msa_fill_b(left[15]);
2242 ST_UB(vec0, (left + 16));
2244 if (x0 != 0 && y0 != 0) {
2245 a = ((left[size_max_y - 1]) * 0x01010101U);
2246 for (i = (size_max_y - 1);
2247 i > (size_max_y - 1) - (size_max_y); i -= 4)
2250 ((-1) << hshift)) >> s->
ps.
sps->
2251 log2_min_pu_size)) + (((y0 +
2257 * min_pu_width]).pred_flag ==
2261 a = ((left[i - 3]) * 0x01010101U);
2264 ((-1) << hshift)) >> s->
ps.
sps->
2265 log2_min_pu_size)) + (((y0 + ((-1)
2270 * min_pu_width]).pred_flag == PF_INTRA))
2272 } else if (x0 == 0) {
2274 uint32_t pix = ((0) * 0x01010101U);
2275 for (i = 0; i < (size_max_y); i += 4)
2279 a = ((left[size_max_y - 1]) * 0x01010101U);
2280 for (i = (size_max_y - 1);
2281 i > (size_max_y - 1) - (size_max_y); i -= 4)
2284 ((-1) << hshift)) >> s->
ps.
sps->
2285 log2_min_pu_size)) + (((y0 +
2291 * min_pu_width]).pred_flag ==
2295 a = ((left[i - 3]) * 0x01010101U);
2299 a = ((left[-1]) * 0x01010101U);
2300 for (i = 0; i < (0) + (size_max_x); i += 4)
2303 ((i) << hshift)) >> s->
ps.
sps->
2304 log2_min_pu_size)) + (((y0 + ((-1)
2309 * min_pu_width]).pred_flag ==
2313 a = ((top[i + 3]) * 0x01010101U);
2318 if (!cand_bottom_left) {
2320 vec0 = (v16u8) __msa_fill_b(left[15]);
2322 ST_UB(vec0, (left + 16));
2324 } else if (cand_up_left) {
2325 vec0 = (v16u8) __msa_fill_b(left[-1]);
2327 ST_UB2(vec0, vec0, left, 16);
2330 } else if (cand_up) {
2333 vec0 = (v16u8) __msa_fill_b(left[-1]);
2335 ST_UB2(vec0, vec0, left, 16);
2339 } else if (cand_up_right) {
2340 vec0 = (v16u8) __msa_fill_b(top[16]);
2346 ST_UB2(vec0, vec0, left, 16);
2353 vec0 = (v16u8) __msa_ldi_b(128);
2355 ST_UB2(vec0, vec0, top, 16);
2356 ST_UB2(vec0, vec0, left, 16);
2361 vec0 = (v16u8) __msa_fill_b(left[16]);
2364 if (!cand_up_left) {
2368 vec0 = (v16u8) __msa_fill_b(left[-1]);
2371 if (!cand_up_right) {
2372 vec0 = (v16u8) __msa_fill_b(top[15]);
2373 ST_UB(vec0, (top + 16));
2382 int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
2383 int min_dist_vert_hor =
2384 (((((int) (mode - 26U)) >=
2385 0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))) >
2386 ((((int) (mode - 10U)) >=
2387 0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2388 ? ((((int) (mode - 10U)) >=
2389 0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2390 : ((((int) (mode - 26U)) >=
2391 0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))));
2392 if (min_dist_vert_hor > intra_hor_ver_dist_thresh[4 - 3]) {
2393 filtered_left[2 * 16 - 1] = left[2 * 16 - 1];
2394 filtered_top[2 * 16 - 1] = top[2 * 16 - 1];
2395 for (i = 2 * 16 - 2; i >= 0; i--)
2396 filtered_left[i] = (left[i + 1] + 2 * left[i] +
2397 left[i - 1] + 2) >> 2;
2400 (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
2401 for (i = 2 * 16 - 2; i >= 0; i--)
2402 filtered_top[i] = (top[i + 1] + 2 * top[i] +
2403 top[i - 1] + 2) >> 2;
2404 left = filtered_left;
2417 (uint8_t *) left, stride, 4, c_idx);
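/*
 * Editor's sketch (not part of the original file): the [1 2 1]/4 reference
 * smoothing selected above when the mode is far enough from pure
 * horizontal/vertical (threshold from intra_hor_ver_dist_thresh).  "size2"
 * is twice the block size; ref[-1] (the corner sample) must be readable,
 * as it is for the left_array/top_array buffers used here.
 */
static void smooth_ref_121(uint8_t *filt, const uint8_t *ref, int size2)
{
    int i;

    filt[size2 - 1] = ref[size2 - 1];
    for (i = size2 - 2; i >= 0; i--)
        filt[i] = (ref[i + 1] + 2 * ref[i] + ref[i - 1] + 2) >> 2;
}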
2429 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2430 v8i16 res0, res1, res2, res3;
2431 v8i16 mul_val0 = { 63, 62, 61, 60, 59, 58, 57, 56 };
2432 v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
2437 int size_in_luma_h = 32 << hshift;
2439 int size_in_luma_v = 32 << vshift;
2441 int x = x0 >> hshift;
2442 int y = y0 >> vshift;
2457 uint8_t left_array[2 * 32 + 1];
2458 uint8_t filtered_left_array[2 * 32 + 1];
2459 uint8_t top_array[2 * 32 + 1];
2460 uint8_t filtered_top_array[2 * 32 + 1];
2462 uint8_t *left = left_array + 1;
2464 uint8_t *filtered_left = filtered_left_array + 1;
2465 uint8_t *filtered_top = filtered_top_array + 1;
2478 int bottom_left_size =
2479 (((y0 + 2 * size_in_luma_v) >
2481 2 * size_in_luma_v)) -
2482 (y0 + size_in_luma_v)) >> vshift;
2483 int top_right_size =
2484 (((x0 + 2 * size_in_luma_h) >
2486 (x0 + size_in_luma_h)) >> hshift;
2493 if (!size_in_luma_pu_h)
2494 size_in_luma_pu_h++;
2495 if (cand_bottom_left == 1 && on_pu_edge_x) {
2500 ((size_in_luma_pu_v) >
2503 y_bottom_pu) : (size_in_luma_pu_v));
2504 cand_bottom_left = 0;
2505 for (i = 0; i < max; i += 2)
2509 i) * min_pu_width]).pred_flag ==
2512 if (cand_left == 1 && on_pu_edge_x) {
2516 ((size_in_luma_pu_v) >
2519 y_left_pu) : (size_in_luma_pu_v));
2521 for (i = 0; i < max; i += 2)
2525 i) * min_pu_width]).pred_flag ==
2528 if (cand_up_left == 1) {
2533 (y_top_pu) * min_pu_width]).pred_flag ==
2536 if (cand_up == 1 && on_pu_edge_y) {
2540 ((size_in_luma_pu_h) >
2543 x_top_pu) : (size_in_luma_pu_h));
2545 for (i = 0; i < max; i += 2)
2549 min_pu_width]).pred_flag == PF_INTRA);
2551 if (cand_up_right == 1 && on_pu_edge_y) {
2556 ((size_in_luma_pu_h) >
2559 x_right_pu) : (size_in_luma_pu_h));
2561 for (i = 0; i < max; i += 2)
2565 min_pu_width]).pred_flag == PF_INTRA);
2567 vec0 = (v16u8) __msa_ldi_b(128);
2569 ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2570 ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2575 left[-1] = src[(-1) + stride * (-1)];
2579 LD_UB2(src - stride, 16, vec0, vec1);
2580 ST_UB2(vec0, vec1, top, 16);
2583 if (cand_up_right) {
2584 LD_UB2(src - stride + 32, 16, vec0, vec1);
2585 ST_UB2(vec0, vec1, (top + 32), 16);
2588 ((src[(32 + top_right_size - 1) + stride * (-1)]) *
2590 for (i = 0; i < (32 - top_right_size); i += 4)
2596 for (i = 0; i < 32; i++)
2597 left[i] = src[(-1) + stride * (i)];
2598 if (cand_bottom_left) {
2599 for (i = 32; i < 32 + bottom_left_size; i++)
2600 left[i] = src[(-1) + stride * (i)];
2603 ((src[(-1) + stride * (32 + bottom_left_size - 1)]) *
2605 for (i = 0; i < (32 - bottom_left_size); i += 4)
2606 ((((union unaligned_32 *) (left + 32 + bottom_left_size +
2612 if (cand_bottom_left || cand_left || cand_up_left || cand_up
2615 x0 + ((2 * 32) << hshift) <
2618 y0 + ((2 * 32) << vshift) <
2620 int j = 32 + (cand_bottom_left ? bottom_left_size : 0) - 1;
2621 if (!cand_up_right) {
2622 size_max_x = x0 + ((32) << hshift) < s->ps.sps->width ?
2625 if (!cand_bottom_left) {
2626 size_max_y = y0 + ((32) << vshift) < s->ps.sps->height ?
2629 if (cand_bottom_left || cand_left || cand_up_left) {
2633 ((-1) << hshift)) >> s->
ps.
sps->
2634 log2_min_pu_size)) + (((y0 +
2639 * min_pu_width]).pred_flag ==
2644 ((-1) << hshift)) >> s->
ps.
sps->
2645 log2_min_pu_size)) + (((y0 + ((j)
2650 * min_pu_width]).pred_flag == PF_INTRA)) {
2652 while (j < size_max_x
2655 ((j) << hshift)) >> s->
ps.
sps->
2656 log2_min_pu_size)) + (((y0 +
2662 * min_pu_width]).pred_flag ==
2665 for (i = j; i > (j) - (j + 1); i--)
2669 1) << hshift)) >> s->
ps.
sps->
2670 log2_min_pu_size)) + (((y0 +
2676 * min_pu_width]).pred_flag ==
2678 top[i - 1] = top[i];
2683 while (j < size_max_x
2686 ((j) << hshift)) >> s->
ps.
sps->
2687 log2_min_pu_size)) + (((y0 + ((-1)
2692 * min_pu_width]).pred_flag ==
2697 for (i = j; i > (j) - (j + 1); i--)
2708 min_pu_width]).pred_flag ==
2710 top[i - 1] = top[i];
2712 for (i = j; i > (j) - (j); i--)
2723 min_pu_width]).pred_flag ==
2725 top[i - 1] = top[i];
2731 if (cand_bottom_left || cand_left) {
2732 a = ((left[-1]) * 0x01010101U);
2733 for (i = 0; i < (0) + (size_max_y); i += 4)
2736 ((-1) << hshift)) >> s->
ps.
sps->
2737 log2_min_pu_size)) + (((y0 +
2742 * min_pu_width]).pred_flag ==
2746 a = ((left[i + 3]) * 0x01010101U);
2749 vec0 = (v16u8) __msa_fill_b(left[-1]);
2751 ST_UB2(vec0, vec0, left, 16);
2753 if (!cand_bottom_left) {
2754 vec0 = (v16u8) __msa_fill_b(left[31]);
2756 ST_UB2(vec0, vec0, (left + 32), 16);
2758 if (x0 != 0 && y0 != 0) {
2759 a = ((left[size_max_y - 1]) * 0x01010101U);
2760 for (i = (size_max_y - 1);
2761 i > (size_max_y - 1) - (size_max_y); i -= 4)
2764 ((-1) << hshift)) >> s->
ps.
sps->
2765 log2_min_pu_size)) + (((y0 +
2771 * min_pu_width]).pred_flag ==
2775 a = ((left[i - 3]) * 0x01010101U);
2778 ((-1) << hshift)) >> s->
ps.
sps->
2779 log2_min_pu_size)) + (((y0 + ((-1)
2784 * min_pu_width]).pred_flag == PF_INTRA))
2786 } else if (x0 == 0) {
2788 uint32_t pix = ((0) * 0x01010101U);
2789 for (i = 0; i < (size_max_y); i += 4)
2793 a = ((left[size_max_y - 1]) * 0x01010101U);
2794 for (i = (size_max_y - 1);
2795 i > (size_max_y - 1) - (size_max_y); i -= 4)
2798 ((-1) << hshift)) >> s->
ps.
sps->
2799 log2_min_pu_size)) + (((y0 +
2805 * min_pu_width]).pred_flag ==
2809 a = ((left[i - 3]) * 0x01010101U);
2813 a = ((left[-1]) * 0x01010101U);
2814 for (i = 0; i < (0) + (size_max_x); i += 4)
2817 ((i) << hshift)) >> s->
ps.
sps->
2818 log2_min_pu_size)) + (((y0 + ((-1)
2823 * min_pu_width]).pred_flag ==
2827 a = ((top[i + 3]) * 0x01010101U);
2832 if (!cand_bottom_left) {
2834 vec0 = (v16u8) __msa_fill_b(left[31]);
2836 ST_UB2(vec0, vec0, (left + 32), 16);
2837 } else if (cand_up_left) {
2838 vec0 = (v16u8) __msa_fill_b(left[-1]);
2840 ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2843 } else if (cand_up) {
2846 vec0 = (v16u8) __msa_fill_b(left[-1]);
2848 ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2852 } else if (cand_up_right) {
2853 vec0 = (v16u8) __msa_fill_b(top[32]);
2855 ST_UB2(vec0, vec0, top, 16);
2859 ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2867 vec0 = (v16u8) __msa_ldi_b(128);
2869 ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2870 ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2875 vec0 = (v16u8) __msa_fill_b(left[32]);
2877 ST_UB2(vec0, vec0, left, 16);
2879 if (!cand_up_left) {
2883 vec0 = (v16u8) __msa_fill_b(left[-1]);
2885 ST_UB2(vec0, vec0, top, 16);
2887 if (!cand_up_right) {
2888 vec0 = (v16u8) __msa_fill_b(top[31]);
2890 ST_UB2(vec0, vec0, (top + 32), 16);
2899 int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
2900 int min_dist_vert_hor =
2901 (((((int) (mode - 26U)) >=
2902 0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))) >
2903 ((((int) (mode - 10U)) >=
2904 0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2905 ? ((((int) (mode - 10U)) >=
2906 0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2907 : ((((int) (mode - 26U)) >=
2908 0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))));
2909 if (min_dist_vert_hor > intra_hor_ver_dist_thresh[5 - 3]) {
2910 int threshold = 1 << (8 - 5);
2913 && ((top[-1] + top[63] - 2 * top[31]) >=
2914 0 ? (top[-1] + top[63] -
2915 2 * top[31]) : (-(top[-1] + top[63] -
2916 2 * top[31]))) < threshold
2917 && ((left[-1] + left[63] - 2 * left[31]) >=
2918 0 ? (left[-1] + left[63] -
2919 2 * left[31]) : (-(left[-1] + left[63] -
2920 2 * left[31]))) < threshold) {
2923 filtered_top[-1] = top[-1];
2924 filtered_top[63] = top[63];
2927 for (i = 0; i < 63; i++) {
2929 ((63 - i) * top[-1] + (i + 1) * top[63] + 32) >> 6;
2932 tmp0 = __msa_fill_h(top[-1]);
2933 tmp1 = __msa_fill_h(top[63]);
2935 tmp2 = mul_val0 - 8;
2936 tmp3 = mul_val0 - 16;
2937 tmp4 = mul_val0 - 24;
2938 tmp5 = mul_val1 + 8;
2939 tmp6 = mul_val1 + 16;
2940 tmp7 = mul_val1 + 24;
2942 res0 = mul_val0 * tmp0;
2946 res0 += mul_val1 * tmp1;
2947 res1 += tmp5 * tmp1;
2948 res2 += tmp6 * tmp1;
2949 res3 += tmp7 * tmp1;
2951 res0 = __msa_srari_h(res0, 6);
2952 res1 = __msa_srari_h(res1, 6);
2953 res2 = __msa_srari_h(res2, 6);
2954 res3 = __msa_srari_h(res3, 6);
2956 vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
2957 vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
2959 ST_UB2(vec0, vec1, filtered_top, 16);
2961 res0 = mul_val0 - 32;
2962 tmp2 = mul_val0 - 40;
2963 tmp3 = mul_val0 - 48;
2964 tmp4 = mul_val0 - 56;
2965 res3 = mul_val1 + 32;
2966 tmp5 = mul_val1 + 40;
2967 tmp6 = mul_val1 + 48;
2968 tmp7 = mul_val1 + 56;
2973 res0 += res3 * tmp1;
2975 res1 += tmp5 * tmp1;
2976 res2 += tmp6 * tmp1;
2977 res3 += tmp7 * tmp1;
2979 res0 = __msa_srari_h(res0, 6);
2980 res1 = __msa_srari_h(res1, 6);
2981 res2 = __msa_srari_h(res2, 6);
2982 res3 = __msa_srari_h(res3, 6);
2984 vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
2985 vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
2987 ST_UB2(vec0, vec1, (filtered_top + 32), 16);
2989 filtered_top[63] = top[63];
2991 tmp0 = __msa_fill_h(left[-1]);
2992 tmp1 = __msa_fill_h(left[63]);
2994 tmp2 = mul_val0 - 8;
2995 tmp3 = mul_val0 - 16;
2996 tmp4 = mul_val0 - 24;
2997 tmp5 = mul_val1 + 8;
2998 tmp6 = mul_val1 + 16;
2999 tmp7 = mul_val1 + 24;
3001 res0 = mul_val0 * tmp0;
3005 res0 += mul_val1 * tmp1;
3006 res1 += tmp5 * tmp1;
3007 res2 += tmp6 * tmp1;
3008 res3 += tmp7 * tmp1;
3010 res0 = __msa_srari_h(res0, 6);
3011 res1 = __msa_srari_h(res1, 6);
3012 res2 = __msa_srari_h(res2, 6);
3013 res3 = __msa_srari_h(res3, 6);
3015 vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
3016 vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
3018 ST_UB2(vec0, vec1, left, 16);
3020 res0 = mul_val0 - 32;
3021 tmp2 = mul_val0 - 40;
3022 tmp3 = mul_val0 - 48;
3023 tmp4 = mul_val0 - 56;
3024 res3 = mul_val1 + 32;
3025 tmp5 = mul_val1 + 40;
3026 tmp6 = mul_val1 + 48;
3027 tmp7 = mul_val1 + 56;
3032 res0 += res3 * tmp1;
3034 res1 += tmp5 * tmp1;
3035 res2 += tmp6 * tmp1;
3036 res3 += tmp7 * tmp1;
3038 res0 = __msa_srari_h(res0, 6);
3039 res1 = __msa_srari_h(res1, 6);
3040 res2 = __msa_srari_h(res2, 6);
3041 res3 = __msa_srari_h(res3, 6);
3043 vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
3044 vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
3046 ST_UB2(vec0, vec1, (left + 32), 16);
3052 filtered_left[2 * 32 - 1] = left[2 * 32 - 1];
3053 filtered_top[2 * 32 - 1] = top[2 * 32 - 1];
3054 for (i = 2 * 32 - 2; i >= 0; i--)
3055 filtered_left[i] = (left[i + 1] + 2 * left[i] +
3056 left[i - 1] + 2) >> 2;
3059 (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
3060 for (i = 2 * 32 - 2; i >= 0; i--)
3061 filtered_top[i] = (top[i + 1] + 2 * top[i] +
3062 top[i - 1] + 2) >> 2;
3063 left = filtered_left;
3077 (uint8_t *) left, stride, 5, c_idx);
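/*
 * Editor's sketch (not part of the original file): scalar form of the strong
 * (bilinear) intra smoothing that the 32x32 path above computes with the
 * mul_val0/mul_val1 weight vectors, used when both reference edges are
 * nearly linear (the "threshold" test).  filt[-1] must be addressable, as
 * it is for the top/left buffers used above.
 */
static void strong_smooth_ref(uint8_t *filt, const uint8_t *ref)
{
    int i;

    filt[-1] = ref[-1];
    filt[63] = ref[63];
    for (i = 0; i < 63; i++)
        filt[i] = ((63 - i) * ref[-1] + (i + 1) * ref[63] + 32) >> 6;
}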