00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include "libavutil/mem.h"
00022
/*
 * ASSERT_ALIGNED(ptr): in DEBUG builds, abort unless ptr is 16-byte aligned,
 * i.e. unless the low four bits of its address are all zero. In non-DEBUG
 * builds it expands to an empty statement.
 *
 * The previous DEBUG expansion asserted (addr & 0xF) without negation, which
 * is true only for MISALIGNED pointers, so it fired on every correctly
 * aligned pointer and let misaligned ones through.
 *
 * The trailing ';' inside both expansions is kept on purpose so that every
 * existing call site (written with or without its own ';') still compiles.
 */
#ifdef DEBUG
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)(ptr) & 0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif
00028
00029
00030
/*
 * CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2): emit one output row of the four-tap
 * chroma path. It is a bare statement sequence (no do/while wrapper) and
 * relies on names from the expanding function: vA..vD, v6us, zero_u8v, fperm,
 * dst, src, stride and the vsrc*, psum, vdst, ppsum, vfdst, fsum temporaries.
 *
 * On entry vsrc0ssH/vsrc1ssH hold the widened samples of the current row and
 * vsrc2uc/vsrc3uc the byte samples of the next row. The macro:
 *   - widens vsrc2uc/vsrc3uc to 16-bit lanes by merging with zero_u8v;
 *   - accumulates BIAS1 + vA*vsrc0ssH + vB*vsrc1ssH + vC*vsrc2ssH
 *     + vD*vsrc3ssH with vec_mladd, then applies BIAS2() to the sum;
 *   - shifts each lane right by v6us and packs to bytes;
 *   - loads 16 bytes at dst, splices the packed result in through fperm,
 *     passes (fsum, vfdst, vdst) to OP_U8_ALTIVEC (defined by the including
 *     file, not visible here) and stores fsum back to dst;
 *   - makes the next row the current row (vsrc0ssH/vsrc1ssH) and advances
 *     both dst and src by stride.
 */
00031 #define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
00032 vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
00033 vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
00034 \
00035 psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
00036 psum = vec_mladd(vB, vsrc1ssH, psum);\
00037 psum = vec_mladd(vC, vsrc2ssH, psum);\
00038 psum = vec_mladd(vD, vsrc3ssH, psum);\
00039 psum = BIAS2(psum);\
00040 psum = vec_sr(psum, v6us);\
00041 \
00042 vdst = vec_ld(0, dst);\
00043 ppsum = (vec_u8)vec_pack(psum, psum);\
00044 vfdst = vec_perm(vdst, ppsum, fperm);\
00045 \
00046 OP_U8_ALTIVEC(fsum, vfdst, vdst);\
00047 \
00048 vec_st(fsum, 0, dst);\
00049 \
00050 vsrc0ssH = vsrc2ssH;\
00051 vsrc1ssH = vsrc3ssH;\
00052 \
00053 dst += stride;\
00054 src += stride;
00055
/*
 * CHROMA_MC8_ALTIVEC_CORE_SIMPLE: emit one output row of the two-tap chroma
 * path. Like the four-tap core it is a bare statement sequence that uses
 * names from the expanding function (vA, vE, v32ss, v6us, zero_u8v, fperm,
 * dst, src, stride and the vsrc*, psum, vdst, ppsum, vfdst, fsum temporaries).
 *
 * It widens vsrc0uc and vsrc1uc to 16-bit lanes, computes
 * (v32ss + vA*vsrc0ssH + vE*vsrc1ssH) >> v6us, packs to bytes, splices the
 * result into the 16 bytes loaded from dst through fperm, hands
 * (fsum, vfdst, vdst) to OP_U8_ALTIVEC (defined elsewhere), stores fsum and
 * advances dst and src by stride. Unlike the four-tap core it does not carry
 * any row state forward; the caller refreshes vsrc0uc/vsrc1uc itself.
 */
00056 #define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
00057 \
00058 vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
00059 vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
00060 \
00061 psum = vec_mladd(vA, vsrc0ssH, v32ss);\
00062 psum = vec_mladd(vE, vsrc1ssH, psum);\
00063 psum = vec_sr(psum, v6us);\
00064 \
00065 vdst = vec_ld(0, dst);\
00066 ppsum = (vec_u8)vec_pack(psum, psum);\
00067 vfdst = vec_perm(vdst, ppsum, fperm);\
00068 \
00069 OP_U8_ALTIVEC(fsum, vfdst, vdst);\
00070 \
00071 vec_st(fsum, 0, dst);\
00072 \
00073 dst += stride;\
00074 src += stride;
00075
/*
 * Bias hooks handed to CHROMA_MC8_ALTIVEC_CORE as its BIAS2 argument:
 *   noop(a)  - leaves the accumulated sum unchanged;
 *   add28(a) - adds the v28ss vector (taken from the expanding function's
 *              scope) to the accumulated sum.
 * The argument is parenthesised in both expansions so that an expression
 * argument cannot re-associate with surrounding operators.
 */
#define noop(a) (a)
#define add28(a) vec_add(v28ss, (a))
00078
/*
 * 8-pixel-wide chroma motion compensation with bilinear weights.
 *
 * dst    - destination rows; each row is read-modify-written as a whole
 *          16-byte vector at offset 0 (see fperm below)
 * src    - source rows
 * stride - added to BOTH dst and src after every row
 * h      - number of rows to produce
 * x, y   - fractional offsets; the four weights are
 *          A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy
 *
 * Each output sample is (A*s[0] + B*s[1] + C*s[stride] + D*s[stride+1] + 32)
 * >> 6, then combined with the existing destination by OP_U8_ALTIVEC (defined
 * by the including file). When D == 0 a cheaper two-tap path is used.
 */
00079 #ifdef PREFIX_h264_chroma_mc8_altivec
00080 static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
00081 int stride, int h, int x, int y) {
00082 DECLARE_ALIGNED(16, signed int, ABCD)[4] =
00083 {((8 - x) * (8 - y)),
00084 (( x) * (8 - y)),
00085 ((8 - x) * ( y)),
00086 (( x) * ( y))};
00087 register int i;
00088 vec_u8 fperm;
/* Broadcast each weight into all eight 16-bit lanes. Halfword elements
 * 1, 3, 5, 7 of the int vector are used.
 * NOTE(review): those are the low halves of the four ints only under
 * big-endian element order - confirm for the target. */
00089 const vec_s32 vABCD = vec_ld(0, ABCD);
00090 const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
00091 const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
00092 const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
00093 const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
00094 LOAD_ZERO;
/* v32ss = 1 << 5 = 32 (rounding bias); v6us = 6 (final shift). */
00095 const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
00096 const vec_u16 v6us = vec_splat_u16(6);
/* loadSecond: src sits more than 7 bytes past a 16-byte boundary, so a
 * second 16-byte vector is fetched for each row.
 * reallyBadAlign: src % 16 == 15; the "+1" sample vector is then taken
 * straight from the second load instead of going through vsrcperm1. */
00097 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
00098 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
00099
00100 vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
00101 vec_u8 vsrc0uc, vsrc1uc;
00102 vec_s16 vsrc0ssH, vsrc1ssH;
00103 vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
00104 vec_s16 vsrc2ssH, vsrc3ssH, psum;
00105 vec_u8 vdst, ppsum, vfdst, fsum;
00106
/* fperm is used as vec_perm(vdst, ppsum, fperm): indices 0x00-0x0F pick
 * bytes of the loaded destination, 0x10-0x1F bytes of the packed result.
 * With dst 16-byte aligned the first 8 bytes are replaced by the result and
 * the last 8 keep the old destination; otherwise the first 8 are kept and
 * the last 8 are replaced (presumably dst % 16 == 8 in that case - confirm
 * against callers). */
00107 if (((unsigned long)dst) % 16 == 0) {
00108 fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
00109 0x14, 0x15, 0x16, 0x17,
00110 0x08, 0x09, 0x0A, 0x0B,
00111 0x0C, 0x0D, 0x0E, 0x0F};
00112 } else {
00113 fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
00114 0x04, 0x05, 0x06, 0x07,
00115 0x18, 0x19, 0x1A, 0x1B,
00116 0x1C, 0x1D, 0x1E, 0x1F};
00117 }
00118
/* Prime the first source row: vsrc0uc = samples at src, vsrc1uc = samples
 * at src + 1. vsrcBuc stays unset when loadSecond is 0 (hence av_uninit);
 * it is still passed to vec_perm in that case. */
00119 vsrcAuc = vec_ld(0, src);
00120
00121 if (loadSecond)
00122 vsrcBuc = vec_ld(16, src);
00123 vsrcperm0 = vec_lvsl(0, src);
00124 vsrcperm1 = vec_lvsl(1, src);
00125
00126 vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
00127 if (reallyBadAlign)
00128 vsrc1uc = vsrcBuc;
00129 else
00130 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
00131
00132 vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
00133 vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);
00134
/* D = x*y != 0: full four-tap path, bias 32 added up front, no post-bias. */
00135 if (ABCD[3]) {
00136 if (!loadSecond) {
00137 for (i = 0 ; i < h ; i++) {
00138 vsrcCuc = vec_ld(stride + 0, src);
00139 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
00140 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
00141
00142 CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
00143 }
00144 } else {
00145 vec_u8 vsrcDuc;
00146 for (i = 0 ; i < h ; i++) {
00147 vsrcCuc = vec_ld(stride + 0, src);
00148 vsrcDuc = vec_ld(stride + 16, src);
00149 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
00150 if (reallyBadAlign)
00151 vsrc3uc = vsrcDuc;
00152 else
00153 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
00154
00155 CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
00156 }
00157 }
00158 } else {
/* D == 0 means x == 0 or y == 0, so at most one of B and C is non-zero and
 * vE = B + C is the single remaining second weight. */
00159 const vec_s16 vE = vec_add(vB, vC);
/* C = (8-x)*y != 0 together with x*y == 0 implies x == 0: vertical-only
 * interpolation between the current row and the row at src + stride. */
00160 if (ABCD[2]) {
00161 if (!loadSecond) {
00162 for (i = 0 ; i < h ; i++) {
00163 vsrcCuc = vec_ld(stride + 0, src);
00164 vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
00165 CHROMA_MC8_ALTIVEC_CORE_SIMPLE
00166
/* The row just used as "next" becomes the current row. */
00167 vsrc0uc = vsrc1uc;
00168 }
00169 } else {
00170 vec_u8 vsrcDuc;
00171 for (i = 0 ; i < h ; i++) {
00172 vsrcCuc = vec_ld(stride + 0, src);
/* NOTE(review): offset +15 here versus +16 in the four-tap path; presumably
 * equivalent because vec_ld ignores the low four address bits and
 * src % 16 >= 8 on this path - confirm. */
00173 vsrcDuc = vec_ld(stride + 15, src);
00174 vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
00175 CHROMA_MC8_ALTIVEC_CORE_SIMPLE
00176
00177 vsrc0uc = vsrc1uc;
00178 }
00179 }
00180 } else {
/* C == 0 and D == 0 imply y == 0: horizontal-only interpolation, both
 * sample vectors are re-read from the current row on every iteration. */
00181 if (!loadSecond) {
00182 for (i = 0 ; i < h ; i++) {
00183 vsrcCuc = vec_ld(0, src);
00184 vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
00185 vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
00186
00187 CHROMA_MC8_ALTIVEC_CORE_SIMPLE
00188 }
00189 } else {
00190 vec_u8 vsrcDuc;
00191 for (i = 0 ; i < h ; i++) {
00192 vsrcCuc = vec_ld(0, src);
00193 vsrcDuc = vec_ld(15, src);
00194 vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
00195 if (reallyBadAlign)
00196 vsrc1uc = vsrcDuc;
00197 else
00198 vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
00199
00200 CHROMA_MC8_ALTIVEC_CORE_SIMPLE
00201 }
00202 }
00203 }
00204 }
00205 }
00206 #endif
00207
00208
/*
 * 8-pixel-wide chroma motion compensation, "no rounding" variant.
 *
 * Same parameters and weight layout as the h264 chroma function
 * (A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy), but the four-tap core is
 * always used and the bias differs: the accumulation starts from zero and
 * 28 is added afterwards, giving
 * (A*s[0] + B*s[1] + C*s[stride] + D*s[stride+1] + 28) >> 6.
 * The result is merged with the existing destination by OP_U8_ALTIVEC
 * (defined by the including file). stride advances both dst and src.
 */
00209 #ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec
00210 static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
00211 DECLARE_ALIGNED(16, signed int, ABCD)[4] =
00212 {((8 - x) * (8 - y)),
00213 (( x) * (8 - y)),
00214 ((8 - x) * ( y)),
00215 (( x) * ( y))};
00216 register int i;
00217 vec_u8 fperm;
/* Broadcast each weight into all 16-bit lanes (halfword elements 1, 3, 5, 7).
 * NOTE(review): relies on big-endian element order - confirm for the target. */
00218 const vec_s32 vABCD = vec_ld(0, ABCD);
00219 const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
00220 const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
00221 const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
00222 const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
00223 LOAD_ZERO;
/* v28ss = (1 << 5) - 4 = 28, consumed by add28(); v6us = 6 (final shift). */
00224 const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
00225 const vec_u16 v6us = vec_splat_u16(6);
/* loadSecond: src is more than 7 bytes past a 16-byte boundary, a second
 * vector per row is needed. reallyBadAlign: src % 16 == 15, the "+1" vector
 * is taken directly from the second load. */
00226 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
00227 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
00228
00229 vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
00230 vec_u8 vsrc0uc, vsrc1uc;
00231 vec_s16 vsrc0ssH, vsrc1ssH;
00232 vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
00233 vec_s16 vsrc2ssH, vsrc3ssH, psum;
00234 vec_u8 vdst, ppsum, vfdst, fsum;
00235
/* fperm feeds vec_perm(vdst, ppsum, fperm): with dst 16-byte aligned the
 * first 8 bytes come from the new result and the last 8 from the old
 * destination; otherwise the halves are swapped. */
00236 if (((unsigned long)dst) % 16 == 0) {
00237 fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
00238 0x14, 0x15, 0x16, 0x17,
00239 0x08, 0x09, 0x0A, 0x0B,
00240 0x0C, 0x0D, 0x0E, 0x0F};
00241 } else {
00242 fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
00243 0x04, 0x05, 0x06, 0x07,
00244 0x18, 0x19, 0x1A, 0x1B,
00245 0x1C, 0x1D, 0x1E, 0x1F};
00246 }
00247
/* Prime the first source row (samples at src and at src + 1). vsrcBuc is
 * left unset when loadSecond is 0, hence av_uninit. */
00248 vsrcAuc = vec_ld(0, src);
00249
00250 if (loadSecond)
00251 vsrcBuc = vec_ld(16, src);
00252 vsrcperm0 = vec_lvsl(0, src);
00253 vsrcperm1 = vec_lvsl(1, src);
00254
00255 vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
00256 if (reallyBadAlign)
00257 vsrc1uc = vsrcBuc;
00258 else
00259 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
00260
00261 vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
00262 vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);
00263
/* One source vector per row is enough. */
00264 if (!loadSecond) {
00265 for (i = 0 ; i < h ; i++) {
00266
00267
00268 vsrcCuc = vec_ld(stride + 0, src);
00269
00270 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
00271 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
00272
00273 CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
00274 }
00275 } else {
/* The 9 needed bytes straddle two 16-byte vectors. */
00276 vec_u8 vsrcDuc;
00277 for (i = 0 ; i < h ; i++) {
00278 vsrcCuc = vec_ld(stride + 0, src);
00279 vsrcDuc = vec_ld(stride + 16, src);
00280
00281 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
00282 if (reallyBadAlign)
00283 vsrc3uc = vsrcDuc;
00284 else
00285 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
00286
00287 CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
00288 }
00289 }
00290 }
00291 #endif
00292
/* Drop the chroma helper macros so they do not leak past this point.
 * NOTE(review): CHROMA_MC8_ALTIVEC_CORE_SIMPLE is not undefined here and
 * stays defined for the rest of the translation unit - confirm intended. */
00293 #undef noop
00294 #undef add28
00295 #undef CHROMA_MC8_ALTIVEC_CORE
00296
00297
/*
 * Horizontal six-tap low-pass filter over a 16x16 block.
 *
 * For every one of 16 rows and each of 16 columns it computes
 *   (20*(s[0]+s[1]) - 5*(s[-1]+s[2]) + (s[-2]+s[3]) + 16) >> 5,
 * saturates to unsigned 8 bits, combines the row with the current contents
 * of dst through OP_U8_ALTIVEC (defined by the including file) and stores
 * 16 bytes at dst.
 *
 * dst       - destination; accessed with vec_ld/vec_st at offset 0, so it is
 *             presumably 16-byte aligned (ASSERT_ALIGNED is applied to it)
 * src       - source; samples from src-2 to src+18 of each row are read
 * dstStride - added to dst after every row
 * srcStride - added to src after every row
 */
00298 #ifdef PREFIX_h264_qpel16_h_lowpass_altivec
00299 static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
00300 register int i;
00301
00302 LOAD_ZERO;
/* Permute controls for the six shifted windows src-2 .. src+3; computed once
 * from the initial src and reused for every row. */
00303 const vec_u8 permM2 = vec_lvsl(-2, src);
00304 const vec_u8 permM1 = vec_lvsl(-1, src);
00305 const vec_u8 permP0 = vec_lvsl(+0, src);
00306 const vec_u8 permP1 = vec_lvsl(+1, src);
00307 const vec_u8 permP2 = vec_lvsl(+2, src);
00308 const vec_u8 permP3 = vec_lvsl(+3, src);
/* Constants: 5, shift count 5, 20 (= 5 << 2) and 16 (= 1 << 4). */
00309 const vec_s16 v5ss = vec_splat_s16(5);
00310 const vec_u16 v5us = vec_splat_u16(5);
00311 const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
00312 const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
00313
00314 vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
00315
/* Offset of the leftmost tap (src - 2) inside its 16-byte line; selects how
 * many source vectors each row needs. */
00316 register int align = ((((unsigned long)src) - 2) % 16);
00317
00318 vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
00319 srcP2A, srcP2B, srcP3A, srcP3B,
00320 srcM1A, srcM1B, srcM2A, srcM2B,
00321 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
00322 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
00323 psumA, psumB, sumA, sumB;
00324
00325 vec_u8 sum, fsum;
00326
00327 for (i = 0 ; i < 16 ; i ++) {
00328 vec_u8 srcR1 = vec_ld(-2, src);
00329 vec_u8 srcR2 = vec_ld(14, src);
00330
/* Build the six shifted 16-byte windows. For align 0..10 all of them fit in
 * srcR1:srcR2. From align 11 upward one window starts exactly at srcR2 and
 * is taken directly; the windows after it (align 12..15) spill into a third
 * vector srcR3 loaded at offset 30. */
00331 switch (align) {
00332 default: {
00333 srcM2 = vec_perm(srcR1, srcR2, permM2);
00334 srcM1 = vec_perm(srcR1, srcR2, permM1);
00335 srcP0 = vec_perm(srcR1, srcR2, permP0);
00336 srcP1 = vec_perm(srcR1, srcR2, permP1);
00337 srcP2 = vec_perm(srcR1, srcR2, permP2);
00338 srcP3 = vec_perm(srcR1, srcR2, permP3);
00339 } break;
00340 case 11: {
00341 srcM2 = vec_perm(srcR1, srcR2, permM2);
00342 srcM1 = vec_perm(srcR1, srcR2, permM1);
00343 srcP0 = vec_perm(srcR1, srcR2, permP0);
00344 srcP1 = vec_perm(srcR1, srcR2, permP1);
00345 srcP2 = vec_perm(srcR1, srcR2, permP2);
00346 srcP3 = srcR2;
00347 } break;
00348 case 12: {
00349 vec_u8 srcR3 = vec_ld(30, src);
00350 srcM2 = vec_perm(srcR1, srcR2, permM2);
00351 srcM1 = vec_perm(srcR1, srcR2, permM1);
00352 srcP0 = vec_perm(srcR1, srcR2, permP0);
00353 srcP1 = vec_perm(srcR1, srcR2, permP1);
00354 srcP2 = srcR2;
00355 srcP3 = vec_perm(srcR2, srcR3, permP3);
00356 } break;
00357 case 13: {
00358 vec_u8 srcR3 = vec_ld(30, src);
00359 srcM2 = vec_perm(srcR1, srcR2, permM2);
00360 srcM1 = vec_perm(srcR1, srcR2, permM1);
00361 srcP0 = vec_perm(srcR1, srcR2, permP0);
00362 srcP1 = srcR2;
00363 srcP2 = vec_perm(srcR2, srcR3, permP2);
00364 srcP3 = vec_perm(srcR2, srcR3, permP3);
00365 } break;
00366 case 14: {
00367 vec_u8 srcR3 = vec_ld(30, src);
00368 srcM2 = vec_perm(srcR1, srcR2, permM2);
00369 srcM1 = vec_perm(srcR1, srcR2, permM1);
00370 srcP0 = srcR2;
00371 srcP1 = vec_perm(srcR2, srcR3, permP1);
00372 srcP2 = vec_perm(srcR2, srcR3, permP2);
00373 srcP3 = vec_perm(srcR2, srcR3, permP3);
00374 } break;
00375 case 15: {
00376 vec_u8 srcR3 = vec_ld(30, src);
00377 srcM2 = vec_perm(srcR1, srcR2, permM2);
00378 srcM1 = srcR2;
00379 srcP0 = vec_perm(srcR2, srcR3, permP0);
00380 srcP1 = vec_perm(srcR2, srcR3, permP1);
00381 srcP2 = vec_perm(srcR2, srcR3, permP2);
00382 srcP3 = vec_perm(srcR2, srcR3, permP3);
00383 } break;
00384 }
00385
/* Widen every window to 16-bit lanes; the A/B suffix denotes the two halves
 * produced by vec_mergeh / vec_mergel with the zero vector. */
00386 srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
00387 srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
00388 srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
00389 srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
00390
00391 srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
00392 srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
00393 srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
00394 srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
00395
00396 srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
00397 srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
00398 srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
00399 srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
00400
/* Symmetric tap pairs: sum1 = inner, sum2 = middle, sum3 = outer. */
00401 sum1A = vec_adds(srcP0A, srcP1A);
00402 sum1B = vec_adds(srcP0B, srcP1B);
00403 sum2A = vec_adds(srcM1A, srcP2A);
00404 sum2B = vec_adds(srcM1B, srcP2B);
00405 sum3A = vec_adds(srcM2A, srcP3A);
00406 sum3B = vec_adds(srcM2B, srcP3B);
00407
/* pp1 = 20*sum1 + 16, pp2 = 5*sum2, psum = pp1 + sum3 - pp2. */
00408 pp1A = vec_mladd(sum1A, v20ss, v16ss);
00409 pp1B = vec_mladd(sum1B, v20ss, v16ss);
00410
00411 pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
00412 pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
00413
00414 pp3A = vec_add(sum3A, pp1A);
00415 pp3B = vec_add(sum3B, pp1B);
00416
00417 psumA = vec_sub(pp3A, pp2A);
00418 psumB = vec_sub(pp3B, pp2B);
00419
/* Arithmetic shift by 5, then pack both halves with unsigned saturation. */
00420 sumA = vec_sra(psumA, v5us);
00421 sumB = vec_sra(psumB, v5us);
00422
00423 sum = vec_packsu(sumA, sumB);
00424
00425 ASSERT_ALIGNED(dst);
00426
00427 OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));
00428
00429 vec_st(fsum, 0, dst);
00430
00431 src += srcStride;
00432 dst += dstStride;
00433 }
00434 }
00435 #endif
00436
00437
/*
 * Vertical six-tap low-pass filter over a 16x16 block.
 *
 * For each of 16 output rows it computes, per column,
 *   (20*(r[0]+r[1]) - 5*(r[-1]+r[2]) + (r[-2]+r[3]) + 16) >> 5
 * where r[k] is the source row k rows below the current one, saturates to
 * unsigned 8 bits, combines with the current dst row through OP_U8_ALTIVEC
 * (defined by the including file) and stores 16 bytes at dst.
 *
 * dst       - destination; accessed with vec_ld/vec_st at offset 0, so it is
 *             presumably 16-byte aligned (ASSERT_ALIGNED is applied to it)
 * src       - source; rows from src - 2*srcStride onward are read (21 rows
 *             in total: 5 preloaded + 16 in the loop)
 * dstStride - added to dst after every row
 * srcStride - distance between consecutive source rows
 */
00438 #ifdef PREFIX_h264_qpel16_v_lowpass_altivec
00439 static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
00440 register int i;
00441
00442 LOAD_ZERO;
/* One permute control serves every row; it is derived from src only. */
00443 const vec_u8 perm = vec_lvsl(0, src);
/* Constants: 20 (= 5 << 2), shift count 5, 5 and 16 (= 1 << 4). */
00444 const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
00445 const vec_u16 v5us = vec_splat_u16(5);
00446 const vec_s16 v5ss = vec_splat_s16(5);
00447 const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
00448
/* Row cursor, starting two rows above src. */
00449 uint8_t *srcbis = src - (srcStride * 2);
00450
/* Preload rows -2 .. +2; each is assembled from two 16-byte loads.
 * Note that srcbis is advanced inside the first vec_ld of every later row. */
00451 const vec_u8 srcM2a = vec_ld(0, srcbis);
00452 const vec_u8 srcM2b = vec_ld(16, srcbis);
00453 const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
00454
00455 const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
00456 const vec_u8 srcM1b = vec_ld(16, srcbis);
00457 const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
00458
00459 const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
00460 const vec_u8 srcP0b = vec_ld(16, srcbis);
00461 const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
00462
00463 const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
00464 const vec_u8 srcP1b = vec_ld(16, srcbis);
00465 const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
00466
00467 const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
00468 const vec_u8 srcP2b = vec_ld(16, srcbis);
00469 const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
00470
00471
/* Widen the five preloaded rows to 16-bit lanes (A/B = mergeh/mergel half). */
00472 vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
00473 vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
00474 vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
00475 vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
00476 vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
00477 vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
00478 vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
00479 vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
00480 vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
00481 vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);
00482
00483 vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
00484 psumA, psumB, sumA, sumB,
00485 srcP3ssA, srcP3ssB,
00486 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
00487
00488 vec_u8 sum, fsum, srcP3a, srcP3b, srcP3;
00489
00490 for (i = 0 ; i < 16 ; i++) {
/* Fetch and widen the newest row (+3 relative to the output row). */
00491 srcP3a = vec_ld(0, srcbis += srcStride);
00492 srcP3b = vec_ld(16, srcbis);
00493 srcP3 = vec_perm(srcP3a, srcP3b, perm);
00494 srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
00495 srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
00496
00497
/* Symmetric tap pairs: sum1 = inner, sum2 = middle, sum3 = outer. */
00498 sum1A = vec_adds(srcP0ssA, srcP1ssA);
00499 sum1B = vec_adds(srcP0ssB, srcP1ssB);
00500 sum2A = vec_adds(srcM1ssA, srcP2ssA);
00501 sum2B = vec_adds(srcM1ssB, srcP2ssB);
00502 sum3A = vec_adds(srcM2ssA, srcP3ssA);
00503 sum3B = vec_adds(srcM2ssB, srcP3ssB);
00504
/* Slide the six-row window down by one row for the next iteration. */
00505 srcM2ssA = srcM1ssA;
00506 srcM2ssB = srcM1ssB;
00507 srcM1ssA = srcP0ssA;
00508 srcM1ssB = srcP0ssB;
00509 srcP0ssA = srcP1ssA;
00510 srcP0ssB = srcP1ssB;
00511 srcP1ssA = srcP2ssA;
00512 srcP1ssB = srcP2ssB;
00513 srcP2ssA = srcP3ssA;
00514 srcP2ssB = srcP3ssB;
00515
/* pp1 = 20*sum1 + 16, pp2 = 5*sum2, psum = pp1 + sum3 - pp2. */
00516 pp1A = vec_mladd(sum1A, v20ss, v16ss);
00517 pp1B = vec_mladd(sum1B, v20ss, v16ss);
00518
00519 pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
00520 pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
00521
00522 pp3A = vec_add(sum3A, pp1A);
00523 pp3B = vec_add(sum3B, pp1B);
00524
00525 psumA = vec_sub(pp3A, pp2A);
00526 psumB = vec_sub(pp3B, pp2B);
00527
/* Arithmetic shift by 5, then pack both halves with unsigned saturation. */
00528 sumA = vec_sra(psumA, v5us);
00529 sumB = vec_sra(psumB, v5us);
00530
00531 sum = vec_packsu(sumA, sumB);
00532
00533 ASSERT_ALIGNED(dst);
00534
00535 OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));
00536
00537 vec_st(fsum, 0, dst);
00538
00539 dst += dstStride;
00540 }
00541 }
00542 #endif
00543
00544
/*
 * Combined horizontal + vertical six-tap low-pass filter over a 16x16 block.
 *
 * Pass 1 runs the horizontal filter WITHOUT rounding or shift over 21 source
 * rows (starting two rows above src) and stores the 16-bit intermediates
 *   20*(s[0]+s[1]) - 5*(s[-1]+s[2]) + (s[-2]+s[3])
 * to tmp, 16 values (two vectors) per row.
 * Pass 2 runs the same filter vertically over those rows in 32-bit
 * precision, computes (value + 512) >> 10, saturates to unsigned 8 bits,
 * combines with dst through OP_U8_ALTIVEC (defined by the including file)
 * and stores 16 bytes per row for 16 rows.
 *
 * dst       - destination; accessed with vec_ld/vec_st at offset 0, so it is
 *             presumably 16-byte aligned (ASSERT_ALIGNED is applied to it)
 * tmp       - scratch buffer for the intermediates; written with vec_st at
 *             byte offsets 0 and 16, so presumably 16-byte aligned - confirm
 * src       - source block
 * dstStride - added to dst after every output row
 * tmpStride - added to the int16_t pointer tmp per row (so counted in
 *             int16_t elements)
 * srcStride - distance between consecutive source rows
 */
00545 #ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
00546 static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
00547 register int i;
00548 LOAD_ZERO;
/* Permute controls for the six shifted windows src-2 .. src+3. */
00549 const vec_u8 permM2 = vec_lvsl(-2, src);
00550 const vec_u8 permM1 = vec_lvsl(-1, src);
00551 const vec_u8 permP0 = vec_lvsl(+0, src);
00552 const vec_u8 permP1 = vec_lvsl(+1, src);
00553 const vec_u8 permP2 = vec_lvsl(+2, src);
00554 const vec_u8 permP3 = vec_lvsl(+3, src);
/* Constants: 20 (= 5 << 2), shift 10, 5, 1, 512 (= 1 << 9), shift 16
 * (= 1 << 4). */
00555 const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
00556 const vec_u32 v10ui = vec_splat_u32(10);
00557 const vec_s16 v5ss = vec_splat_s16(5);
00558 const vec_s16 v1ss = vec_splat_s16(1);
00559 const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
00560 const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
00561
/* Offset of the leftmost tap (src - 2) inside its 16-byte line. */
00562 register int align = ((((unsigned long)src) - 2) % 16);
00563
00564 vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
00565 srcP2A, srcP2B, srcP3A, srcP3B,
00566 srcM1A, srcM1B, srcM2A, srcM2B,
00567 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
00568 pp1A, pp1B, pp2A, pp2B, psumA, psumB;
00569
/* Interleaves bytes 0..7 with bytes 8..15 (0,8,1,9,...); used at the end of
 * pass 2 to put even-lane and odd-lane results back in column order. */
00570 const vec_u8 mperm = (const vec_u8)
00571 {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
00572 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
/* Read cursor for pass 2; tmp itself is advanced while writing in pass 1. */
00573 int16_t *tmpbis = tmp;
00574
00575 vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
00576 tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
00577 tmpP2ssA, tmpP2ssB;
00578
00579 vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
00580 pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
00581 pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
00582 ssumAe, ssumAo, ssumBe, ssumBo;
00583 vec_u8 fsum, sumv, sum;
00584 vec_s16 ssume, ssumo;
00585
/* ---- Pass 1: horizontal filter over 21 rows (16 + 5 for the vertical
 * taps), starting two rows above the block. ---- */
00586 src -= (2 * srcStride);
00587 for (i = 0 ; i < 21 ; i ++) {
00588 vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
00589 vec_u8 srcR1 = vec_ld(-2, src);
00590 vec_u8 srcR2 = vec_ld(14, src);
00591
/* Build the six shifted windows. For align 0..10 everything fits in
 * srcR1:srcR2; from align 11 upward one window coincides with srcR2 and the
 * later ones (align 12..15) need a third vector loaded at offset 30. */
00592 switch (align) {
00593 default: {
00594 srcM2 = vec_perm(srcR1, srcR2, permM2);
00595 srcM1 = vec_perm(srcR1, srcR2, permM1);
00596 srcP0 = vec_perm(srcR1, srcR2, permP0);
00597 srcP1 = vec_perm(srcR1, srcR2, permP1);
00598 srcP2 = vec_perm(srcR1, srcR2, permP2);
00599 srcP3 = vec_perm(srcR1, srcR2, permP3);
00600 } break;
00601 case 11: {
00602 srcM2 = vec_perm(srcR1, srcR2, permM2);
00603 srcM1 = vec_perm(srcR1, srcR2, permM1);
00604 srcP0 = vec_perm(srcR1, srcR2, permP0);
00605 srcP1 = vec_perm(srcR1, srcR2, permP1);
00606 srcP2 = vec_perm(srcR1, srcR2, permP2);
00607 srcP3 = srcR2;
00608 } break;
00609 case 12: {
00610 vec_u8 srcR3 = vec_ld(30, src);
00611 srcM2 = vec_perm(srcR1, srcR2, permM2);
00612 srcM1 = vec_perm(srcR1, srcR2, permM1);
00613 srcP0 = vec_perm(srcR1, srcR2, permP0);
00614 srcP1 = vec_perm(srcR1, srcR2, permP1);
00615 srcP2 = srcR2;
00616 srcP3 = vec_perm(srcR2, srcR3, permP3);
00617 } break;
00618 case 13: {
00619 vec_u8 srcR3 = vec_ld(30, src);
00620 srcM2 = vec_perm(srcR1, srcR2, permM2);
00621 srcM1 = vec_perm(srcR1, srcR2, permM1);
00622 srcP0 = vec_perm(srcR1, srcR2, permP0);
00623 srcP1 = srcR2;
00624 srcP2 = vec_perm(srcR2, srcR3, permP2);
00625 srcP3 = vec_perm(srcR2, srcR3, permP3);
00626 } break;
00627 case 14: {
00628 vec_u8 srcR3 = vec_ld(30, src);
00629 srcM2 = vec_perm(srcR1, srcR2, permM2);
00630 srcM1 = vec_perm(srcR1, srcR2, permM1);
00631 srcP0 = srcR2;
00632 srcP1 = vec_perm(srcR2, srcR3, permP1);
00633 srcP2 = vec_perm(srcR2, srcR3, permP2);
00634 srcP3 = vec_perm(srcR2, srcR3, permP3);
00635 } break;
00636 case 15: {
00637 vec_u8 srcR3 = vec_ld(30, src);
00638 srcM2 = vec_perm(srcR1, srcR2, permM2);
00639 srcM1 = srcR2;
00640 srcP0 = vec_perm(srcR2, srcR3, permP0);
00641 srcP1 = vec_perm(srcR2, srcR3, permP1);
00642 srcP2 = vec_perm(srcR2, srcR3, permP2);
00643 srcP3 = vec_perm(srcR2, srcR3, permP3);
00644 } break;
00645 }
00646
/* Widen every window to 16-bit lanes (A/B = mergeh/mergel half). */
00647 srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
00648 srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
00649 srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
00650 srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
00651
00652 srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
00653 srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
00654 srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
00655 srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
00656
00657 srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
00658 srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
00659 srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
00660 srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
00661
00662 sum1A = vec_adds(srcP0A, srcP1A);
00663 sum1B = vec_adds(srcP0B, srcP1B);
00664 sum2A = vec_adds(srcM1A, srcP2A);
00665 sum2B = vec_adds(srcM1B, srcP2B);
00666 sum3A = vec_adds(srcM2A, srcP3A);
00667 sum3B = vec_adds(srcM2B, srcP3B);
00668
/* psum = 20*sum1 + sum3 - 5*sum2; no rounding term and no shift here. */
00669 pp1A = vec_mladd(sum1A, v20ss, sum3A);
00670 pp1B = vec_mladd(sum1B, v20ss, sum3B);
00671
00672 pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
00673 pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
00674
00675 psumA = vec_sub(pp1A, pp2A);
00676 psumB = vec_sub(pp1B, pp2B);
00677
/* Store the 16 intermediates of this row (two vectors, 32 bytes). */
00678 vec_st(psumA, 0, tmp);
00679 vec_st(psumB, 16, tmp);
00680
00681 src += srcStride;
00682 tmp += tmpStride;
00683 }
00684
/* ---- Pass 2: vertical filter over the intermediates. Preload rows
 * -2 .. +2 of the six-row window from the start of the scratch buffer. ---- */
00685 tmpM2ssA = vec_ld(0, tmpbis);
00686 tmpM2ssB = vec_ld(16, tmpbis);
00687 tmpbis += tmpStride;
00688 tmpM1ssA = vec_ld(0, tmpbis);
00689 tmpM1ssB = vec_ld(16, tmpbis);
00690 tmpbis += tmpStride;
00691 tmpP0ssA = vec_ld(0, tmpbis);
00692 tmpP0ssB = vec_ld(16, tmpbis);
00693 tmpbis += tmpStride;
00694 tmpP1ssA = vec_ld(0, tmpbis);
00695 tmpP1ssB = vec_ld(16, tmpbis);
00696 tmpbis += tmpStride;
00697 tmpP2ssA = vec_ld(0, tmpbis);
00698 tmpP2ssB = vec_ld(16, tmpbis);
00699 tmpbis += tmpStride;
00700
00701 for (i = 0 ; i < 16 ; i++) {
00702 const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
00703 const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);
00704
/* NOTE: these block-scope sum1A..sum3B shadow the function-scope variables
 * of the same names used in pass 1. */
00705 const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
00706 const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
00707 const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
00708 const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
00709 const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
00710 const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
00711
00712 tmpbis += tmpStride;
00713
/* Slide the six-row window down by one row for the next iteration. */
00714 tmpM2ssA = tmpM1ssA;
00715 tmpM2ssB = tmpM1ssB;
00716 tmpM1ssA = tmpP0ssA;
00717 tmpM1ssB = tmpP0ssB;
00718 tmpP0ssA = tmpP1ssA;
00719 tmpP0ssB = tmpP1ssB;
00720 tmpP1ssA = tmpP2ssA;
00721 tmpP1ssB = tmpP2ssB;
00722 tmpP2ssA = tmpP3ssA;
00723 tmpP2ssB = tmpP3ssB;
00724
/* 16x16 -> 32-bit products, kept separately for even (e) and odd (o)
 * halfword lanes: pp1 = 20*sum1, pp2 = 5*sum2. */
00725 pp1Ae = vec_mule(sum1A, v20ss);
00726 pp1Ao = vec_mulo(sum1A, v20ss);
00727 pp1Be = vec_mule(sum1B, v20ss);
00728 pp1Bo = vec_mulo(sum1B, v20ss);
00729
00730 pp2Ae = vec_mule(sum2A, v5ss);
00731 pp2Ao = vec_mulo(sum2A, v5ss);
00732 pp2Be = vec_mule(sum2B, v5ss);
00733 pp2Bo = vec_mulo(sum2B, v5ss);
00734
/* Sign-extend sum3 to 32 bits: odd lanes via multiply by 1; even lanes by
 * an arithmetic right shift of 16 on the 32-bit view, which keeps the upper
 * halfword of each word.
 * NOTE(review): upper halfword == even lane assumes big-endian element
 * order - confirm for the target. */
00735 pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
00736 pp3Ao = vec_mulo(sum3A, v1ss);
00737 pp3Be = vec_sra((vec_s32)sum3B, v16ui);
00738 pp3Bo = vec_mulo(sum3B, v1ss);
00739
/* sum = (20*sum1 + 512) + (sum3 - 5*sum2). */
00740 pp1cAe = vec_add(pp1Ae, v512si);
00741 pp1cAo = vec_add(pp1Ao, v512si);
00742 pp1cBe = vec_add(pp1Be, v512si);
00743 pp1cBo = vec_add(pp1Bo, v512si);
00744
00745 pp32Ae = vec_sub(pp3Ae, pp2Ae);
00746 pp32Ao = vec_sub(pp3Ao, pp2Ao);
00747 pp32Be = vec_sub(pp3Be, pp2Be);
00748 pp32Bo = vec_sub(pp3Bo, pp2Bo);
00749
00750 sumAe = vec_add(pp1cAe, pp32Ae);
00751 sumAo = vec_add(pp1cAo, pp32Ao);
00752 sumBe = vec_add(pp1cBe, pp32Be);
00753 sumBo = vec_add(pp1cBo, pp32Bo);
00754
/* Arithmetic shift right by 10. */
00755 ssumAe = vec_sra(sumAe, v10ui);
00756 ssumAo = vec_sra(sumAo, v10ui);
00757 ssumBe = vec_sra(sumBe, v10ui);
00758 ssumBo = vec_sra(sumBo, v10ui);
00759
/* Narrow 32 -> 16 (signed saturation) then 16 -> 8 (unsigned saturation);
 * sumv holds the even-lane results in bytes 0..7 and the odd-lane results in
 * bytes 8..15, and mperm interleaves them back into column order. */
00760 ssume = vec_packs(ssumAe, ssumBe);
00761 ssumo = vec_packs(ssumAo, ssumBo);
00762
00763 sumv = vec_packsu(ssume, ssumo);
00764 sum = vec_perm(sumv, sumv, mperm);
00765
00766 ASSERT_ALIGNED(dst);
00767
00768 OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));
00769
00770 vec_st(fsum, 0, dst);
00771
00772 dst += dstStride;
00773 }
00774 }
00775 #endif