#ifdef DEBUG_ALIGNMENT
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif

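/* One row of the four-tap chroma core: compute
 * psum = (A*src0 + B*src1 + C*src2 + D*src3 + 32) >> 6, pack the eight
 * results back to bytes, and blend them through fperm into the half of the
 * destination vector that must survive the aligned store.  The bottom two
 * source vectors become the top two of the next iteration. */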
#define CHROMA_MC8_ALTIVEC_CORE \
    vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc2uc); \
    vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc3uc); \
\
    psum = vec_mladd(vA, vsrc0ssH, v32ss); \
    psum = vec_mladd(vB, vsrc1ssH, psum); \
    psum = vec_mladd(vC, vsrc2ssH, psum); \
    psum = vec_mladd(vD, vsrc3ssH, psum); \
    psum = vec_sr(psum, v6us); \
\
    vdst  = vec_ld(0, dst); \
    ppsum = (vec_u8)vec_pack(psum, psum); \
    vfdst = vec_perm(vdst, ppsum, fperm); \
\
    OP_U8_ALTIVEC(fsum, vfdst, vdst); \
\
    vec_st(fsum, 0, dst); \
\
    vsrc0ssH = vsrc2ssH; \
    vsrc1ssH = vsrc3ssH; \
\
    dst += stride; \
    src += stride;

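/* Two-tap variant for the x == 0 or y == 0 cases: only two source vectors
 * contribute, with the merged weight vE = vB + vC, so
 * psum = (A*src0 + E*src1 + 32) >> 6. */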
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc); \
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc); \
\
    psum = vec_mladd(vA, vsrc0ssH, v32ss); \
    psum = vec_mladd(vE, vsrc1ssH, psum); \
    psum = vec_sr(psum, v6us); \
\
    vdst  = vec_ld(0, dst); \
    ppsum = (vec_u8)vec_pack(psum, psum); \
    vfdst = vec_perm(vdst, ppsum, fperm); \
\
    OP_U8_ALTIVEC(fsum, vfdst, vdst); \
\
    vec_st(fsum, 0, dst); \
\
    dst += stride; \
    src += stride;

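/* 8x8 chroma motion compensation with eighth-pel offsets (x, y): standard
 * H.264 bilinear weights A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy.
 * loadSecond is set when src is misaligned by more than 7 bytes, so a row
 * of 9 source pixels spans two 16-byte blocks; reallyBadAlign marks the
 * offset-15 case where the lvsl(1, src) permute degenerates. */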
void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                    int stride, int h, int x, int y) {
    POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
    DECLARE_ALIGNED_16(signed int, ABCD[4]) =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1), vec_splat_u16(5));
    const vec_u16 v6us  = vec_splat_u16(6);
    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);

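    /* fperm keeps eight bytes of the existing destination vector and takes
     * the other eight from the packed result; which half comes from where
     * depends on whether dst itself is 16-byte aligned. */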
    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

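    /* Load the top source row once; vsrc0uc/vsrc1uc are that row at
     * horizontal offsets 0 and 1 for the bilinear taps.  At misalignment
     * 15, src + 1 is itself 16-byte aligned, so vsrcBuc is used verbatim. */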
    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);

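    /* D = xy is nonzero only when both x and y are: that needs the full
     * four-tap core.  Otherwise a two-tap loop suffices, vertical when
     * C = (8-x)y is nonzero, horizontal when only B remains. */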
    if (ABCD[3]) {
        if (!loadSecond) {
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE
            }
        } else {
            vec_u8 vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride +  0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE
            }
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) {
            if (!loadSecond) {
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride +  0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else {
            if (!loadSecond) {
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld( 0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
}

#undef CHROMA_MC8_ALTIVEC_CORE

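/* Horizontal 16-wide 6-tap qpel lowpass:
 * dst[i] = clip((src[i-2] - 5*src[i-1] + 20*src[i] + 20*src[i+1]
 *                - 5*src[i+2] + src[i+3] + 16) >> 5)
 * computed on two 8-element halves (A/B) in 16-bit saturating arithmetic.
 * dst must be 16-byte aligned. */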
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss  = vec_splat_s16(5);
    const vec_u16 v5us  = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, vdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

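        /* src-2 .. src+18 span at most three 16-byte blocks.  Alignments
         * 0..10 need only srcR1/srcR2; at 11 the permP3 shuffle degenerates
         * into srcR2 itself; 12..15 need a third load, with one more
         * permute collapsing per step. */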
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

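        /* OP_U8_ALTIVEC is supplied by the file that includes this
         * template: a plain assignment for the put_ variants or an average
         * with the current destination for the avg_ variants. */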
        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
}

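/* Vertical 16-wide 6-tap qpel lowpass: same (1, -5, 20, 20, -5, 1) kernel
 * as the horizontal pass, applied along srcStride.  Six rows are kept live
 * in registers as a sliding window, so each output row costs one new load. */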
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);

    register int i;

    LOAD_ZERO;
    const vec_u8 perm   = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_u16 v5us  = vec_splat_u16(5);
    const vec_s16 v5ss  = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2  = vec_perm(srcM2a, srcM2b, perm);

    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1  = vec_perm(srcM1a, srcM1b, perm);

    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0  = vec_perm(srcP0a, srcP0b, perm);

    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1  = vec_perm(srcP1a, srcP1b, perm);

    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2  = vec_perm(srcP2a, srcP2b, perm);

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3  = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

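        /* Slide the six-row window down one row; the sums above were taken
         * first, so the old values are no longer needed. */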
        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
}

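/* Combined 2D (hv) qpel filter.  Pass 1 runs the horizontal 6-tap kernel
 * over 21 rows into the 16-bit tmp buffer, with no rounding or shift.
 * Pass 2 filters tmp vertically; the intermediate products exceed 16 bits,
 * so they are widened to 32 bits, rounded with +512 and scaled by >> 10. */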
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss  = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_u32 v10ui  = vec_splat_u32(10);
    const vec_s16 v5ss   = vec_splat_s16(5);
    const vec_s16 v1ss   = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1), vec_splat_u32(9));
    const vec_u32 v16ui  = vec_sl(vec_splat_u32(1), vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum, vdst;
    vec_s16 ssume, ssumo;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride;
    }

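    /* Prime the six-row window for the vertical pass; each A/B pair holds
     * the low and high eight 16-bit samples of one 16-wide row of tmp. */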
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

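    /* Second pass: vertical 6-tap over the 16-bit intermediates.  The
     * products no longer fit in 16 bits, so everything below is done in
     * even/odd 32-bit lanes. */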
    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

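        /* vec_mule/vec_mulo widen the 16-bit sums into even/odd 32-bit
         * lanes.  sum3 is multiplied by 1: the even lanes fall out of an
         * arithmetic >> 16 of the reinterpreted vector, the odd lanes from
         * vec_mulo(sum3, 1).  After packing, mperm re-interleaves the
         * even/odd bytes back into pixel order. */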
        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum  = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
}