/* HEVC inverse-transform coefficient tables (MSA port). */
static const int16_t gt8x8_cnst[16] = {
    64, 64, 83, 36, 89, 50, 18, 75, 64, -64, 36, -83, 75, -89, -50, -18
};
static const int16_t gt16x16_cnst[64] = {
    64, 83, 64, 36, 89, 75, 50, 18, 90, 80, 57, 25, 70, 87, 9, 43,
    64, 36, -64, -83, 75, -18, -89, -50, 87, 9, -80, -70, -43, 57, -25, -90,
    64, -36, -64, 83, 50, -89, 18, 75, 80, -70, -25, 90, -87, 9, 43, 57,
    64, -83, 64, -36, 18, -50, 75, -89, 70, -87, 90, -80, 9, -43, -57, 25
};
static const int16_t gt32x32_cnst0[256] = {
    90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4,
    90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13,
    88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22,
    85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31,
    82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38,
    78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46,
    73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54,
    67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61,
    61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67,
    54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73,
    46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78,
    38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82,
    31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85,
    22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88,
    13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90,
    4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
};
static const int16_t gt32x32_cnst1[64] = {
    90, 87, 80, 70, 57, 43, 25, 9, 87, 57, 9, -43, -80, -90, -70, -25,
    80, 9, -70, -87, -25, 57, 90, 43, 70, -43, -87, 9, 90, 25, -80, -57,
    57, -80, -25, 90, -9, -87, 43, 70, 43, -90, 57, 25, -87, 70, 9, -80,
    25, -70, 90, -80, 43, 9, -57, 87, 9, -25, 43, -57, 70, -80, 87, -90
};
static const int16_t gt32x32_cnst2[16] = {
    89, 75, 50, 18, 75, -18, -89, -50, 50, -89, 18, 75, 18, -50, 75, -89
};
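/*
 * For reference: the column macros below all implement the even/odd
 * (butterfly) decomposition of the HEVC inverse DCT on widened 32-bit lanes,
 * followed by a rounding shift and saturation to 16 bits.  A minimal scalar
 * sketch of one 4-point column is given here, assuming av_clip_int16() from
 * libavutil/common.h is available; idct4_col_ref is a hypothetical name and
 * is not part of this file.
 */
static void idct4_col_ref(const int16_t src[4], int16_t dst[4], int shift)
{
    int add = 1 << (shift - 1);
    int e0  = 64 * src[0] + 64 * src[2];   /* even part */
    int e1  = 64 * src[0] - 64 * src[2];
    int o0  = 83 * src[1] + 36 * src[3];   /* odd part */
    int o1  = 36 * src[1] - 83 * src[3];

    dst[0] = av_clip_int16((e0 + o0 + add) >> shift);
    dst[1] = av_clip_int16((e1 + o1 + add) >> shift);
    dst[2] = av_clip_int16((e1 - o1 + add) >> shift);
    dst[3] = av_clip_int16((e0 - o0 + add) >> shift);
}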
#define HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, \
                         sum0, sum1, sum2, sum3, shift) \
    v4i32 vec0, vec1, vec2, vec3, vec4, vec5; \
    v4i32 cnst64 = __msa_ldi_w(64); \
    v4i32 cnst83 = __msa_ldi_w(83); \
    v4i32 cnst36 = __msa_ldi_w(36); \
    DOTP_SH4_SW(in_r0, in_r1, in_l0, in_l1, cnst64, cnst64, \
                cnst83, cnst36, vec0, vec2, vec1, vec3); \
    DOTP_SH2_SW(in_l0, in_l1, cnst36, cnst83, vec4, vec5); \
    /* ... */ \
    SRARI_W4_SW(sum0, sum1, sum2, sum3, shift); \
    SAT_SW4_SW(sum0, sum1, sum2, sum3, 15);

/* One pass of the 8x8 inverse DCT over eight columns; the coefficient rows
 * are read through the local pointer 'filter' at the expansion site. */
#define HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, shift) \
    v8i16 src0_r, src1_r, src2_r, src3_r; \
    v8i16 src0_l, src1_l, src2_l, src3_l; \
    v8i16 filt0, filter0, filter1, filter2, filter3; \
    v4i32 temp0_r, temp1_r, temp2_r, temp3_r, temp4_r, temp5_r; \
    v4i32 temp0_l, temp1_l, temp2_l, temp3_l, temp4_l, temp5_l; \
    v4i32 sum0_r, sum1_r, sum2_r, sum3_r; \
    v4i32 sum0_l, sum1_l, sum2_l, sum3_l; \
    ILVR_H4_SH(in4, in0, in6, in2, in5, in1, in3, in7, \
               src0_r, src1_r, src2_r, src3_r); \
    ILVL_H4_SH(in4, in0, in6, in2, in5, in1, in3, in7, \
               src0_l, src1_l, src2_l, src3_l); \
    filt0 = LD_SH(filter); \
    SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3); \
    DOTP_SH4_SW(src0_r, src0_l, src1_r, src1_l, filter0, filter0, \
                filter1, filter1, temp0_r, temp0_l, temp1_r, temp1_l); \
    BUTTERFLY_4(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l, \
    /* ... */ \
    DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter2, filter2, \
                filter3, filter3, temp2_r, temp2_l, temp3_r, temp3_l); \
    temp2_r += temp3_r; \
    temp2_l += temp3_l; \
    /* ... */ \
    SRARI_W4_SW(sum0_r, sum0_l, sum3_r, sum3_l, shift); \
    SAT_SW4_SW(sum0_r, sum0_l, sum3_r, sum3_l, 15); \
    PCKEV_H2_SH(sum0_l, sum0_r, sum3_l, sum3_r, in0, in7); \
    DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter3, filter3, \
                filter2, filter2, temp4_r, temp4_l, temp5_r, temp5_l); \
    temp4_r -= temp5_r; \
    temp4_l -= temp5_l; \
    /* ... */ \
    SRARI_W4_SW(sum1_r, sum1_l, sum2_r, sum2_l, shift); \
    SAT_SW4_SW(sum1_r, sum1_l, sum2_r, sum2_l, 15); \
    PCKEV_H2_SH(sum1_l, sum1_r, sum2_l, sum2_r, in3, in4); \
    filt0 = LD_SH(filter + 8); \
    SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3); \
    DOTP_SH4_SW(src0_r, src0_l, src1_r, src1_l, filter0, filter0, \
                filter1, filter1, temp0_r, temp0_l, temp1_r, temp1_l); \
    BUTTERFLY_4(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l, \
    /* ... */ \
    DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter2, filter2, \
                filter3, filter3, temp2_r, temp2_l, temp3_r, temp3_l); \
    temp2_r += temp3_r; \
    temp2_l += temp3_l; \
    /* ... */ \
    SRARI_W4_SW(sum0_r, sum0_l, sum3_r, sum3_l, shift); \
    SAT_SW4_SW(sum0_r, sum0_l, sum3_r, sum3_l, 15); \
    PCKEV_H2_SH(sum0_l, sum0_r, sum3_l, sum3_r, in1, in6); \
    DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter3, filter3, \
                filter2, filter2, temp4_r, temp4_l, temp5_r, temp5_l); \
    temp4_r -= temp5_r; \
    temp4_l -= temp5_l; \
    /* ... */ \
    SRARI_W4_SW(sum1_r, sum1_l, sum2_r, sum2_l, shift); \
    SAT_SW4_SW(sum1_r, sum1_l, sum2_r, sum2_l, 15); \
    PCKEV_H2_SH(sum1_l, sum1_r, sum2_l, sum2_r, in2, in5);

/* One pass of the 16x16 inverse DCT over eight columns; the results go
 * through buf_ptr / ptr0 / ptr1 defined at the expansion site. */
#define HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, \
                           src4_r, src5_r, src6_r, src7_r, \
                           src0_l, src1_l, src2_l, src3_l, \
                           src4_l, src5_l, src6_l, src7_l, shift) \
    int16_t *ptr0, *ptr1; \
    v8i16 filt0, filt1, dst0, dst1; \
    v8i16 filter0, filter1, filter2, filter3; \
    v4i32 temp0_r, temp1_r, temp0_l, temp1_l; \
    v4i32 sum0_r, sum1_r, sum2_r, sum3_r, sum0_l, sum1_l, sum2_l; \
    v4i32 sum3_l, res0_r, res1_r, res0_l, res1_l; \
    ptr0 = (buf_ptr + 112); \
    ptr1 = (buf_ptr + 128); \
    for (j = 0; j < 4; j++) \
        LD_SH2(filter, 8, filt0, filt1) \
        SPLATI_W2_SH(filt0, 0, filter0, filter1); \
        SPLATI_W2_SH(filt1, 0, filter2, filter3); \
        DOTP_SH4_SW(src0_r, src0_l, src4_r, src4_l, filter0, filter0, \
                    filter2, filter2, sum0_r, sum0_l, sum2_r, sum2_l); \
        DOTP_SH2_SW(src7_r, src7_l, filter2, filter2, sum3_r, sum3_l); \
        DPADD_SH4_SW(src1_r, src1_l, src5_r, src5_l, filter1, filter1, \
                     filter3, filter3, sum0_r, sum0_l, sum2_r, sum2_l); \
        DPADD_SH2_SW(src6_r, src6_l, filter3, filter3, sum3_r, sum3_l); \
        /* ... */ \
        SPLATI_W2_SH(filt0, 2, filter0, filter1); \
        SPLATI_W2_SH(filt1, 2, filter2, filter3); \
        DOTP_SH2_SW(src2_r, src2_l, filter0, filter0, temp0_r, temp0_l); \
        DPADD_SH2_SW(src6_r, src6_l, filter2, filter2, sum2_r, sum2_l); \
        DOTP_SH2_SW(src5_r, src5_l, filter2, filter2, temp1_r, temp1_l); \
        /* ... */ \
        sum3_r = temp1_r - sum3_r; \
        sum3_l = temp1_l - sum3_l; \
        DOTP_SH2_SW(src3_r, src3_l, filter1, filter1, temp0_r, temp0_l); \
        DPADD_SH4_SW(src7_r, src7_l, src4_r, src4_l, filter3, filter3, \
                     filter3, filter3, sum2_r, sum2_l, sum3_r, sum3_l); \
        /* ... */ \
        BUTTERFLY_4(sum0_r, sum0_l, sum2_l, sum2_r, res0_r, res0_l, \
        SRARI_W4_SW(res0_r, res0_l, res1_r, res1_l, shift); \
        SAT_SW4_SW(res0_r, res0_l, res1_r, res1_l, 15); \
        PCKEV_H2_SH(res0_l, res0_r, res1_l, res1_r, dst0, dst1); \
        ST_SH(dst0, buf_ptr); \
        ST_SH(dst1, (buf_ptr + ((15 - (j * 2)) * 16))); \
        BUTTERFLY_4(sum1_r, sum1_l, sum3_l, sum3_r, res0_r, res0_l, \
        SRARI_W4_SW(res0_r, res0_l, res1_r, res1_l, shift); \
        SAT_SW4_SW(res0_r, res0_l, res1_r, res1_l, 15); \
        PCKEV_H2_SH(res0_l, res0_r, res1_l, res1_r, dst0, dst1); \
        ST_SH(dst0, (ptr0 + (((j / 2 + j % 2) * 2 * k) * 16))); \
        ST_SH(dst1, (ptr1 - (((j / 2 + j % 2) * 2 * k) * 16)));

#define HEVC_EVEN16_CALC(input, sum0_r, sum0_l, load_idx, store_idx) \
    LD_SW2(input + load_idx * 8, 4, tmp0_r, tmp0_l); \
    /* ... */ \
    ST_SW2(sum0_r, sum0_l, (input + load_idx * 8), 4); \
    /* ... */ \
    ST_SW2(tmp1_r, tmp1_l, (input + store_idx * 8), 4);

/* Column pass of the 4x4 inverse DST-VII used for intra luma residuals. */
#define HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1, \
                              res0, res1, res2, res3, shift) \
    v4i32 vec0, vec1, vec2, vec3; \
    v4i32 cnst74 = __msa_ldi_w(74); \
    v4i32 cnst55 = __msa_ldi_w(55); \
    v4i32 cnst29 = __msa_ldi_w(29); \
    vec0 = in_r0 + in_r1; \
    vec2 = in_r0 - in_l1; \
    res0 = vec0 * cnst29; \
    res1 = vec2 * cnst55; \
    res2 = in_r0 - in_r1; \
    vec1 = in_r1 + in_l1; \
    vec3 = in_l0 * cnst74; \
    res3 = vec0 * cnst55; \
    res0 += vec1 * cnst55; \
    res1 -= vec1 * cnst29; \
    res3 += vec2 * cnst29; \
    /* ... */ \
    SRARI_W4_SW(res0, res1, res2, res3, shift); \
    SAT_SW4_SW(res0, res1, res2, res3, 15);

static void hevc_idct_4x4_msa(int16_t *coeffs)
{
    v4i32 in_r0, in_l0, in_r1, in_l1;
    v4i32 sum0, sum1, sum2, sum3;

    LD_SH2(coeffs, 8, in0, in1);
    /* ... */
    ST_SH2(in0, in1, coeffs, 8);
}
static void hevc_idct_8x8_msa(int16_t *coeffs)
{
    const int16_t *filter = &gt8x8_cnst[0];
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    LD_SH8(coeffs, 8, in0, in1, in2, in3, in4, in5, in6, in7);
    HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 7);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);
    HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 12);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);
    ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, coeffs, 8);
}
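/*
 * Note: every full IDCT path in this file runs two such passes: a first pass
 * with rounding shift 7, a transpose, then a second pass with shift 12,
 * which are the HEVC inverse-transform shift values for 8-bit content.
 */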
static void hevc_idct_16x16_msa(int16_t *coeffs)
{
    int16_t *buf_ptr = &buf[0];
    int16_t *src = coeffs;
    const int16_t *filter = &gt16x16_cnst[0];
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v8i16 src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;

    /* first pass: columns, shift 7 */
    LD_SH16(src, 16, in0, in1, in2, in3, in4, in5, in6, in7,
            in8, in9, in10, in11, in12, in13, in14, in15);

    ILVR_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
               src0_r, src1_r, src2_r, src3_r);
    ILVR_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
               src4_r, src5_r, src6_r, src7_r);
    ILVL_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
               src0_l, src1_l, src2_l, src3_l);
    ILVL_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
               src4_l, src5_l, src6_l, src7_l);
    HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
                       src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
                       src4_l, src5_l, src6_l, src7_l, 7);
    /* ... */
    buf_ptr = (&buf[0] + 8);
    /* ... */

    /* second pass over the transposed first-pass result, shift 12 */
    LD_SH16(src, 8, in0, in8, in1, in9, in2, in10, in3, in11,
            in4, in12, in5, in13, in6, in14, in7, in15);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);
    TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
                       in8, in9, in10, in11, in12, in13, in14, in15);
    ILVR_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
               src0_r, src1_r, src2_r, src3_r);
    ILVR_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
               src4_r, src5_r, src6_r, src7_r);
    ILVL_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
               src0_l, src1_l, src2_l, src3_l);
    ILVL_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
               src4_l, src5_l, src6_l, src7_l);
    HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
                       src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
                       src4_l, src5_l, src6_l, src7_l, 12);
    /* ... */
    buf_ptr = coeffs + 8;
    /* ... */

    /* final in-place 16x16 transpose, done as four 8x8 transposes with the
     * off-diagonal 8x8 blocks swapped */
    LD_SH8(coeffs, 16, in0, in1, in2, in3, in4, in5, in6, in7);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
    ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, coeffs, 16);

    LD_SH8((coeffs + 8), 16, in0, in1, in2, in3, in4, in5, in6, in7);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
    LD_SH8((coeffs + 128), 16, in8, in9, in10, in11, in12, in13, in14, in15);
    ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 128), 16);
    TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
    ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 8), 16);

    LD_SH8((coeffs + 136), 16, in0, in1, in2, in3, in4, in5, in6, in7);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
    ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 136), 16);
}
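/*
 * The 32x32 transform is handled eight columns at a time by the routine
 * below: the even-indexed input rows are reduced stage by stage into a
 * 32-bit scratch buffer, the odd-indexed rows are then accumulated against
 * the 16-tap constants from gt32x32_cnst0, and each iteration of the final
 * loop emits one output row i together with its mirrored row 31 - i.
 */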
static void hevc_idct_8x32_column_msa(int16_t *coeffs, uint8_t buf_pitch,
                                      uint8_t round)
{
    const int16_t *filter_ptr0 = &gt32x32_cnst0[0];
    const int16_t *filter_ptr1 = &gt32x32_cnst1[0];
    const int16_t *filter_ptr2 = &gt32x32_cnst2[0];
    const int16_t *filter_ptr3 = &gt8x8_cnst[0];
    int16_t *src0 = (coeffs + buf_pitch);
    int16_t *src1 = (coeffs + 2 * buf_pitch);
    int16_t *src2 = (coeffs + 4 * buf_pitch);
    int16_t *src3 = (coeffs);
    int32_t *tmp_buf_ptr = tmp_buf + 15;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v8i16 src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;
    v4i32 sum0_r, sum0_l, sum1_r, sum1_l, tmp0_r, tmp0_l, tmp1_r, tmp1_l;
    /* ... */

    /* align the intermediate buffer to a 64-byte boundary */
    tmp_buf_ptr = (int32_t *)(((uintptr_t) tmp_buf_ptr) & ~(uintptr_t) 63);

    /* process rows 4, 12, 20, 28 */
    LD_SH4(src2, 8 * buf_pitch, in0, in1, in2, in3);
    ILVR_H2_SH(in1, in0, in3, in2, src0_r, src1_r);
    ILVL_H2_SH(in1, in0, in3, in2, src0_l, src1_l);

    /* process rows 0, 8, 16, 24 */
    LD_SH2(src3, 16 * buf_pitch, in4, in6);
    LD_SH2((src3 + 8 * buf_pitch), 16 * buf_pitch, in5, in7);
    ILVR_H2_SH(in6, in4, in7, in5, src2_r, src3_r);
    ILVL_H2_SH(in6, in4, in7, in5, src2_l, src3_l);

    for (i = 0; i < 2; i++) {
        cnst0 = LW(filter_ptr2);
        cnst1 = LW(filter_ptr2 + 2);

        filter0 = (v8i16) __msa_fill_w(cnst0);
        filter1 = (v8i16) __msa_fill_w(cnst1);

        DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l);
        DPADD_SH2_SW(src1_r, src1_l, filter1, filter1, sum0_r, sum0_l);
        ST_SW2(sum0_r, sum0_l, (tmp_buf_ptr + 2 * i * 8), 4);

        cnst0 = LW(filter_ptr2 + 4);
        cnst1 = LW(filter_ptr2 + 6);

        filter0 = (v8i16) __msa_fill_w(cnst0);
        filter1 = (v8i16) __msa_fill_w(cnst1);

        DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l);
        DPADD_SH2_SW(src1_r, src1_l, filter1, filter1, sum0_r, sum0_l);
        ST_SW2(sum0_r, sum0_l, (tmp_buf_ptr + (2 * i + 1) * 8), 4);
        /* ... */
    }

    for (i = 0; i < 2; i++) {
        cnst0 = LW(filter_ptr3);
        cnst1 = LW(filter_ptr3 + 2);

        filter0 = (v8i16) __msa_fill_w(cnst0);
        filter1 = (v8i16) __msa_fill_w(cnst1);

        DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter0, filter0, filter1,
                    filter1, sum0_r, sum0_l, tmp1_r, tmp1_l);

        sum1_r = sum0_r - tmp1_r;
        sum1_l = sum0_l - tmp1_l;
        sum0_r = sum0_r + tmp1_r;
        sum0_l = sum0_l + tmp1_l;
        /* ... */
    }

    /* process rows 2, 6, 10, ..., 30 */
    LD_SH8(src1, 4 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7);
    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
               src0_r, src1_r, src2_r, src3_r);
    ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
               src0_l, src1_l, src2_l, src3_l);

    for (i = 0; i < 8; i++) {
        filt0 = LD_SH(filter_ptr1);
        SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);
        DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l);
        DPADD_SH4_SW(src1_r, src1_l, src2_r, src2_l, filter1, filter1, filter2,
                     filter2, sum0_r, sum0_l, sum0_r, sum0_l);
        DPADD_SH2_SW(src3_r, src3_l, filter3, filter3, sum0_r, sum0_l);

        LD_SW2(tmp_buf_ptr + i * 8, 4, tmp0_r, tmp0_l);
        /* ... */
        ST_SW2(tmp0_r, tmp0_l, (tmp_buf_ptr + i * 8), 4);
        /* ... */
        ST_SW2(tmp1_r, tmp1_l, (tmp_buf_ptr + (15 - i) * 8), 4);
        /* ... */
    }

    /* process the odd rows 1, 3, 5, ..., 31 */
    LD_SH8(src0, 2 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7);
    src0 += 16 * buf_pitch;
    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
               src0_r, src1_r, src2_r, src3_r);
    ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
               src0_l, src1_l, src2_l, src3_l);

    LD_SH8(src0, 2 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7);
    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
               src4_r, src5_r, src6_r, src7_r);
    ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
               src4_l, src5_l, src6_l, src7_l);

    for (i = 0; i < 16; i++) {
        filt0 = LD_SH(filter_ptr0);
        SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);
        DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l);
        DPADD_SH4_SW(src1_r, src1_l, src2_r, src2_l, filter1, filter1, filter2,
                     filter2, sum0_r, sum0_l, sum0_r, sum0_l);
        DPADD_SH2_SW(src3_r, src3_l, filter3, filter3, sum0_r, sum0_l);
        /* ... */
        filt0 = LD_SH(filter_ptr0 + 8);
        SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);
        DOTP_SH2_SW(src4_r, src4_l, filter0, filter0, sum0_r, sum0_l);
        DPADD_SH4_SW(src5_r, src5_l, src6_r, src6_l, filter1, filter1, filter2,
                     filter2, sum0_r, sum0_l, sum0_r, sum0_l);
        DPADD_SH2_SW(src7_r, src7_l, filter3, filter3, sum0_r, sum0_l);
        /* ... */
        LD_SW2(tmp_buf_ptr + i * 8, 4, tmp0_r, tmp0_l);
        /* ... */
        sum1_r = __msa_fill_w(round);
        /* ... */
        in0 = __msa_pckev_h((v8i16) tmp0_l, (v8i16) tmp0_r);
        ST_SH(in0, (coeffs + i * buf_pitch));
        /* ... */
        in0 = __msa_pckev_h((v8i16) tmp1_l, (v8i16) tmp1_r);
        ST_SH(in0, (coeffs + (31 - i) * buf_pitch));
        /* ... */
    }
}
static void hevc_idct_transpose_32x8_to_8x32(int16_t *coeffs, int16_t *tmp_buf)
{
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    for (i = 0; i < 4; i++) {
        LD_SH8(coeffs + i * 8, 32, in0, in1, in2, in3, in4, in5, in6, in7);
        TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                           in0, in1, in2, in3, in4, in5, in6, in7);
        ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, tmp_buf + i * 8 * 8, 8);
    }
}
static void hevc_idct_transpose_8x32_to_32x8(int16_t *tmp_buf, int16_t *coeffs)
{
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    for (i = 0; i < 4; i++) {
        LD_SH8(tmp_buf + i * 8 * 8, 8, in0, in1, in2, in3, in4, in5, in6, in7);
        TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                           in0, in1, in2, in3, in4, in5, in6, in7);
        ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, coeffs + i * 8, 32);
    }
}
static void hevc_idct_32x32_msa(int16_t *coeffs)
{
    int16_t *src = coeffs;
    int16_t tmp_buf[8 * 32 + 31];
    int16_t *tmp_buf_ptr = tmp_buf + 31;

    /* align the scratch buffer to a 64-byte boundary */
    tmp_buf_ptr = (int16_t *)(((uintptr_t) tmp_buf_ptr) & ~(uintptr_t) 63);

    /* column pass: one hevc_idct_8x32_column_msa() call per 8-column slice */
    for (col_cnt = 0; col_cnt < 4; col_cnt++) {
        /* ... */
    }

    /* row pass: each 32x8 strip is transposed into the scratch buffer,
     * run through the same column routine, and transposed back */
    for (row_cnt = 0; row_cnt < 4; row_cnt++) {
        src = (coeffs + 32 * 8 * row_cnt);
        /* ... */
    }
}
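/*
 * The DC-only helpers below fold both transform passes into scalar
 * arithmetic on coeffs[0]: (64 * dc + 64) >> 7 simplifies to (dc + 1) >> 1,
 * and the second pass (64 * val + 2048) >> 12 simplifies to (val + 32) >> 6;
 * the result is then broadcast over the whole block.
 */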
static void hevc_idct_dc_4x4_msa(int16_t *coeffs)
{
    int32_t val;
    v8i16 dst;

    val = (coeffs[0] + 1) >> 1;
    val = (val + 32) >> 6;
    dst = __msa_fill_h(val);

    ST_SH2(dst, dst, coeffs, 8);
}
static void hevc_idct_dc_8x8_msa(int16_t *coeffs)
{
    int32_t val;
    v8i16 dst;

    val = (coeffs[0] + 1) >> 1;
    val = (val + 32) >> 6;
    dst = __msa_fill_h(val);

    ST_SH8(dst, dst, dst, dst, dst, dst, dst, dst, coeffs, 8);
}
static void hevc_idct_dc_16x16_msa(int16_t *coeffs)
{
    uint8_t loop;
    int32_t val;
    v8i16 dst;

    val = (coeffs[0] + 1) >> 1;
    val = (val + 32) >> 6;
    dst = __msa_fill_h(val);

    for (loop = 4; loop--;) {
        ST_SH8(dst, dst, dst, dst, dst, dst, dst, dst, coeffs, 8);
        coeffs += 8 * 8;
    }
}
static void hevc_idct_dc_32x32_msa(int16_t *coeffs)
{
    uint8_t loop;
    int32_t val;
    v8i16 dst;

    val = (coeffs[0] + 1) >> 1;
    val = (val + 32) >> 6;
    dst = __msa_fill_h(val);

    for (loop = 16; loop--;) {
        ST_SH8(dst, dst, dst, dst, dst, dst, dst, dst, coeffs, 8);
        coeffs += 8 * 8;
    }
}
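/*
 * The addblk helpers below add the decoded residual back onto the 8-bit
 * prediction in dst with unsigned clipping.  A scalar sketch of what one
 * WxH block amounts to (addblk_ref is a hypothetical name; av_clip_uint8()
 * is assumed from libavutil/common.h, and the residual rows are assumed to
 * be stored contiguously, W values per row):
 */
static void addblk_ref(uint8_t *dst, const int16_t *res, ptrdiff_t stride,
                       int w, int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++)
            dst[x] = av_clip_uint8(dst[x] + res[x]);
        dst += stride;
        res += w;
    }
}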
static void hevc_addblk_4x4_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
{
    uint32_t dst0, dst1, dst2, dst3;
    v8i16 dst_r0, dst_l0, in0, in1;
    v4i32 dst_vec = { 0 };

    LD_SH2(coeffs, 8, in0, in1);
    LW4(dst, stride, dst0, dst1, dst2, dst3);
    /* ... */
    ADD2(dst_r0, in0, dst_l0, in1, dst_r0, dst_l0);
    /* ... */
    dst_vec = (v4i32) __msa_pckev_b((v16i8) dst_l0, (v16i8) dst_r0);
    ST_W4(dst_vec, 0, 1, 2, 3, dst, stride);
}
static void hevc_addblk_8x8_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
{
    uint64_t dst0, dst1, dst2, dst3;
    v2i64 dst_vec0 = { 0 };
    v2i64 dst_vec1 = { 0 };
    v8i16 dst_r0, dst_l0, dst_r1, dst_l1;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    LD_SH8(coeffs, 8, in0, in1, in2, in3, in4, in5, in6, in7);
    LD4(temp_dst, stride, dst0, dst1, dst2, dst3);
    /* ... */
    ADD4(dst_r0, in0, dst_l0, in1, dst_r1, in2, dst_l1, in3,
         dst_r0, dst_l0, dst_r1, dst_l1);
    /* ... */
    PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
    ST_D4(dst_r0, dst_r1, 0, 1, 0, 1, dst, stride);

    LD4(temp_dst, stride, dst0, dst1, dst2, dst3);
    /* ... */
    ADD4(dst_r0, in4, dst_l0, in5, dst_r1, in6, dst_l1, in7,
         dst_r0, dst_l0, dst_r1, dst_l1);
    /* ... */
    PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
    ST_D4(dst_r0, dst_r1, 0, 1, 0, 1, dst + 4 * stride, stride);
}
static void hevc_addblk_16x16_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
{
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 dst_r0, dst_l0, dst_r1, dst_l1, dst_r2, dst_l2, dst_r3, dst_l3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    /* pre-load for the first loop iteration */
    LD_UB4(temp_dst, stride, dst4, dst5, dst6, dst7);
    /* ... */
    LD_SH4(coeffs, 16, in0, in2, in4, in6);
    LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);

    for (loop_cnt = 3; loop_cnt--;) {
        /* ... */
        LD_UB4(temp_dst, stride, dst4, dst5, dst6, dst7);
        /* ... */
        LD_SH4(coeffs, 16, in0, in2, in4, in6);
        LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
        /* ... */
             dst_r2, dst_l2, dst_r3, dst_l3);
        PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
                    dst_r3, dst0, dst1, dst2, dst3);
        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
        /* ... */
    }

    /* last iteration, without a further pre-load */
    /* ... */
         dst_r2, dst_l2, dst_r3, dst_l3);
    PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
                dst_r3, dst0, dst1, dst2, dst3);
    ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
}
static void hevc_addblk_32x32_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
{
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 dst_r0, dst_l0, dst_r1, dst_l1, dst_r2, dst_l2, dst_r3, dst_l3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    /* pre-load for the first loop iteration */
    LD_UB2(temp_dst, 16, dst4, dst5);
    /* ... */
    LD_UB2(temp_dst, 16, dst6, dst7);
    /* ... */
    LD_SH4(coeffs, 16, in0, in2, in4, in6);
    LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);

    for (loop_cnt = 14; loop_cnt--;) {
        /* ... */
        LD_UB2(temp_dst, 16, dst4, dst5);
        /* ... */
        LD_UB2(temp_dst, 16, dst6, dst7);
        /* ... */
        LD_SH4(coeffs, 16, in0, in2, in4, in6);
        LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
        /* ... */
             dst_r2, dst_l2, dst_r3, dst_l3);
        PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
                    dst_r3, dst0, dst1, dst2, dst3);
        ST_UB2(dst0, dst1, dst, 16);
        /* ... */
        ST_UB2(dst2, dst3, dst, 16);
        /* ... */
    }

    /* last two row groups (loop epilogue) */
    LD_UB2(temp_dst, 16, dst4, dst5);
    /* ... */
    LD_UB2(temp_dst, 16, dst6, dst7);
    /* ... */
    LD_SH4(coeffs, 16, in0, in2, in4, in6);
    LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
    /* ... */
         dst_r2, dst_l2, dst_r3, dst_l3);
    PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
                dst_r3, dst0, dst1, dst2, dst3);
    ST_UB2(dst0, dst1, dst, 16);
    /* ... */
    ST_UB2(dst2, dst3, dst, 16);
    /* ... */
         dst_r2, dst_l2, dst_r3, dst_l3);
    PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
                dst_r3, dst0, dst1, dst2, dst3);
    ST_UB2(dst0, dst1, dst, 16);
    /* ... */
    ST_UB2(dst2, dst3, dst, 16);
}
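/*
 * The luma 4x4 path below is the HEVC inverse DST-VII rather than the DCT;
 * per column the standard transform computes
 *   d0 = 29*(s0 + s2) + 55*(s2 + s3) + 74*s1
 *   d1 = 55*(s0 - s3) - 29*(s2 + s3) + 74*s1
 *   d2 = 74*(s0 - s2 + s3)
 *   d3 = 55*(s0 + s2) + 29*(s0 - s3) - 74*s1
 * followed by the same rounding shift and 16-bit saturation as the DCT paths.
 */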
static void hevc_idct_luma_4x4_msa(int16_t *coeffs)
{
    v8i16 in0, in1, dst0, dst1;
    v4i32 in_r0, in_l0, in_r1, in_l1, res0, res1, res2, res3;

    LD_SH2(coeffs, 8, in0, in1);
    /* ... */
    ST_SH2(dst0, dst1, coeffs, 8);
}
/* Public entry points exported by this file; each wraps the corresponding
 * static helper above. */
void ff_hevc_idct_4x4_msa(int16_t *coeffs, int col_limit);
void ff_hevc_idct_8x8_msa(int16_t *coeffs, int col_limit);
void ff_hevc_idct_16x16_msa(int16_t *coeffs, int col_limit);
void ff_hevc_idct_32x32_msa(int16_t *coeffs, int col_limit);
void ff_hevc_idct_dc_4x4_msa(int16_t *coeffs);
void ff_hevc_idct_dc_8x8_msa(int16_t *coeffs);
void ff_hevc_idct_dc_16x16_msa(int16_t *coeffs);
void ff_hevc_idct_dc_32x32_msa(int16_t *coeffs);
void ff_hevc_idct_luma_4x4_msa(int16_t *coeffs);
void ff_hevc_addblk_4x4_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
void ff_hevc_addblk_8x8_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
void ff_hevc_addblk_16x16_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
void ff_hevc_addblk_32x32_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);