/* Abort in debug builds if ptr is not 16-byte aligned. */
#ifdef DEBUG_ALIGNMENT
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif

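/* Per-row core of the 8-pixel-wide chroma interpolation: zero-extend the two
 * next-row source vectors, multiply the four neighbouring source vectors by
 * the bilinear weights vA..vD, apply BIAS1 (added before accumulation) and
 * BIAS2 (post-accumulation adjustment, see noop/add28 below), shift right by
 * 6, merge the packed result into the destination vector through fperm and
 * store it, then slide the row window down and advance one row. */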
#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
    vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
    vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
    psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
    psum = vec_mladd(vB, vsrc1ssH, psum);\
    psum = vec_mladd(vC, vsrc2ssH, psum);\
    psum = vec_mladd(vD, vsrc3ssH, psum);\
    psum = BIAS2(psum);\
    psum = vec_sr(psum, v6us);\
\
    vdst = vec_ld(0, dst);\
    ppsum = (vec_u8)vec_pack(psum, psum);\
    vfdst = vec_perm(vdst, ppsum, fperm);\
\
    OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
    vec_st(fsum, 0, dst);\
\
    vsrc0ssH = vsrc2ssH;\
    vsrc1ssH = vsrc3ssH;\
\
    dst += stride;\
    src += stride;

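/* Reduced core used when the interpolation is purely horizontal or purely
 * vertical (one of x, y is zero): only two taps, vA and vE = vB + vC,
 * contribute, with the usual +32 bias and >>6. */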
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
    psum = vec_mladd(vA, vsrc0ssH, v32ss);\
    psum = vec_mladd(vE, vsrc1ssH, psum);\
    psum = vec_sr(psum, v6us);\
\
    vdst = vec_ld(0, dst);\
    ppsum = (vec_u8)vec_pack(psum, psum);\
    vfdst = vec_perm(vdst, ppsum, fperm);\
\
    OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
    vec_st(fsum, 0, dst);\
\
    dst += stride;\
    src += stride;

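/* Post-accumulation adjustments plugged into CHROMA_MC8_ALTIVEC_CORE as
 * BIAS2: the H.264 chroma MC below passes the +32 bias up front and leaves
 * the sum untouched (noop); the VC-1 "no rounding" variant starts from 0 and
 * adds 28 here instead. */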
#define noop(a) a
#define add28(a) vec_add(v28ss, a)

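/* H.264 chroma motion compensation, 8 pixels wide by h rows: bilinear
 * interpolation with weights (8-x)(8-y), x(8-y), (8-x)y and xy, rounded with
 * +32 and scaled by >>6.  The scalar weights are splatted from the aligned
 * ABCD array; loadSecond/reallyBadAlign track how far src is from a 16-byte
 * boundary so each unaligned row can be assembled with vec_ld/vec_perm. */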
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                           int stride, int h, int x, int y) {
    POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         (( x) * (8 - y)),
                         ((8 - x) * ( y)),
                         (( x) * ( y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);

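    /* ABCD[3] = x*y is nonzero only for a true 2-D interpolation; otherwise
     * fall back to the cheaper two-tap path (vertical-only when
     * ABCD[2] = (8-x)*y is nonzero, i.e. x == 0, horizontal-only when
     * y == 0). */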
    if (ABCD[3]) {
        if (!loadSecond) {
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        } else {
            vec_u8 vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) {
            if (!loadSecond) {
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else {
            if (!loadSecond) {
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
}

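/* Same bilinear chroma interpolation as above, but with VC-1's "no rounding"
 * bias: the accumulator starts at 0 and 28 is added before the >>6, and only
 * the full 2-D path is implemented. */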
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         (( x) * (8 - y)),
                         ((8 - x) * ( y)),
                         (( x) * ( y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);

    if (!loadSecond) {
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    } else {
        vec_u8 vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    }
}

#undef noop
#undef add28
#undef CHROMA_MC8_ALTIVEC_CORE

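/* Horizontal 6-tap luma lowpass for a 16x16 block: for each row apply the
 * H.264 half-pel filter (1, -5, 20, 20, -5, 1) with rounding (+16) and >>5,
 * saturate to 8 bits and write the result through OP_U8_ALTIVEC (put or avg,
 * depending on how the including file defines it). */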
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, vdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

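        /* Each row needs the 21 bytes src-2 .. src+18, which can span up to
         * three 16-byte blocks depending on the alignment of src-2.
         * Alignments 0..10 are handled by the default case with two loads;
         * for 11..15 the shifted vector that starts exactly on the second
         * block is used directly, and 12..15 need a third load (srcR3). */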
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
}

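/* Vertical 6-tap luma lowpass for a 16x16 block: the same
 * (1, -5, 20, 20, -5, 1) filter with +16 rounding and >>5 applied down the
 * columns.  Five rows are preloaded and kept in registers; each iteration
 * loads one new row, filters, and shifts the six-row window down by one. */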
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);

    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);

    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);

    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);

    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);

    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
}

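/* Combined horizontal+vertical 6-tap lowpass: a first pass filters 21 rows
 * horizontally into the 16-bit tmp buffer (no rounding or shift yet), then a
 * second pass filters tmp vertically with 32-bit intermediates, adding the
 * 512 bias and shifting right by 10 before packing back to 8 bits. */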
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum, vdst;
    vec_s16 ssume, ssumo;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride;
    }

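    /* Second pass: run the vertical filter over the 16-bit intermediate rows
     * in tmp.  The products no longer fit in 16 bits, so even and odd lanes
     * are processed separately with vec_mule/vec_mulo in 32-bit precision and
     * re-interleaved through mperm at the end. */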
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
}