/*
 * Debug-build check that ptr is 16-byte aligned, as required by AltiVec
 * vec_ld/vec_st. BUGFIX: the original asserted (ptr & 0xF), which is TRUE
 * only for MISaligned pointers -- i.e. the assert fired on every correctly
 * aligned pointer. The condition must be that the low four address bits are
 * zero. The argument is also parenthesized to be expansion-safe.
 */
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)(ptr) & 0x0000000F));
/* Non-debug build: alignment checks expand to an empty statement. */
33 #define ASSERT_ALIGNED(ptr) ;
/*
 * load_alignment (big-endian/AltiVec path): fills srcM2..srcP3 with the six
 * unaligned byte vectors at offsets -2..+3 around s, using aligned vec_ld of
 * the 16-byte lines at s-2 (srcR1), s+14 (srcR2) and -- when the rightmost
 * taps straddle a third line -- s+30 (srcR3), combined with the precomputed
 * lvsl permute vectors pm2..pp3.
 * NOTE(review): the dispatch on `ali` that selects which of the repeated
 * per-alignment cases below executes, plus several case boundaries, is
 * missing from this extraction -- confirm against the full file before
 * editing; the code lines are left byte-identical.
 */
37 #define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
38 vec_u8 srcR1 = vec_ld(-2, s);\
39 vec_u8 srcR2 = vec_ld(14, s);\
42 srcM2 = vec_perm(srcR1, srcR2, pm2);\
43 srcM1 = vec_perm(srcR1, srcR2, pm1);\
44 srcP0 = vec_perm(srcR1, srcR2, pp0);\
45 srcP1 = vec_perm(srcR1, srcR2, pp1);\
46 srcP2 = vec_perm(srcR1, srcR2, pp2);\
47 srcP3 = vec_perm(srcR1, srcR2, pp3);\
50 srcM2 = vec_perm(srcR1, srcR2, pm2);\
51 srcM1 = vec_perm(srcR1, srcR2, pm1);\
52 srcP0 = vec_perm(srcR1, srcR2, pp0);\
53 srcP1 = vec_perm(srcR1, srcR2, pp1);\
54 srcP2 = vec_perm(srcR1, srcR2, pp2);\
58 vec_u8 srcR3 = vec_ld(30, s);\
59 srcM2 = vec_perm(srcR1, srcR2, pm2);\
60 srcM1 = vec_perm(srcR1, srcR2, pm1);\
61 srcP0 = vec_perm(srcR1, srcR2, pp0);\
62 srcP1 = vec_perm(srcR1, srcR2, pp1);\
64 srcP3 = vec_perm(srcR2, srcR3, pp3);\
67 vec_u8 srcR3 = vec_ld(30, s);\
68 srcM2 = vec_perm(srcR1, srcR2, pm2);\
69 srcM1 = vec_perm(srcR1, srcR2, pm1);\
70 srcP0 = vec_perm(srcR1, srcR2, pp0);\
72 srcP2 = vec_perm(srcR2, srcR3, pp2);\
73 srcP3 = vec_perm(srcR2, srcR3, pp3);\
76 vec_u8 srcR3 = vec_ld(30, s);\
77 srcM2 = vec_perm(srcR1, srcR2, pm2);\
78 srcM1 = vec_perm(srcR1, srcR2, pm1);\
80 srcP1 = vec_perm(srcR2, srcR3, pp1);\
81 srcP2 = vec_perm(srcR2, srcR3, pp2);\
82 srcP3 = vec_perm(srcR2, srcR3, pp3);\
85 vec_u8 srcR3 = vec_ld(30, s);\
86 srcM2 = vec_perm(srcR1, srcR2, pm2);\
88 srcP0 = vec_perm(srcR2, srcR3, pp0);\
89 srcP1 = vec_perm(srcR2, srcR3, pp1);\
90 srcP2 = vec_perm(srcR2, srcR3, pp2);\
91 srcP3 = vec_perm(srcR2, srcR3, pp3);\
/*
 * load_alignment (VSX path): vec_vsx_ld handles unaligned addresses
 * directly, so the six tap vectors at offsets -2..+3 are loaded straight
 * from memory; the `ali` and permute parameters are accepted only to keep
 * the call signature identical to the AltiVec variant and are unused here.
 * NOTE(review): the macro's closing brace is not visible in this extraction
 * -- confirm against the full file.
 */
96 #define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
97 srcM2 = vec_vsx_ld(-2, s);\
98 srcM1 = vec_vsx_ld(-1, s);\
99 srcP0 = vec_vsx_ld(0, s);\
100 srcP1 = vec_vsx_ld(1, s);\
101 srcP2 = vec_vsx_ld(2, s);\
102 srcP3 = vec_vsx_ld(3, s);\
/*
 * H.264 quarter-pel 16-wide horizontal 6-tap lowpass filter (AltiVec).
 * For each of 16 rows computes, per pixel:
 *     clip(((p0+p1)*20 - (m1+p2)*5 + (m2+p3) + 16) >> 5)
 * with the taps m2..p3 at horizontal offsets -2..+3.
 * NOTE(review): this extraction has gaps -- the function braces, the
 * 8->16-bit unpack of srcM2..srcP3 into the A/B halves, the declarations of
 * i/sum/fsum, and the dst/src stride advances are not visible. Code lines
 * are left byte-identical; confirm against the full file.
 */
107 #ifdef PREFIX_h264_qpel16_h_lowpass_altivec
108 static void PREFIX_h264_qpel16_h_lowpass_altivec(
uint8_t *dst,
110 int dstStride,
int srcStride)
115 vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
/* Filter constants: 5, shift-count 5, 20 (= 5<<2), rounding term 16 (= 1<<4). */
116 const vec_s16 v5ss = vec_splat_s16(5);
117 const vec_u16 v5us = vec_splat_u16(5);
118 const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
119 const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
121 vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
/* Alignment of (src - 2) within a 16-byte line; selects the load_alignment case. */
123 register int align = ((((
unsigned long)src) - 2) % 16);
125 vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
126 srcP2A, srcP2B, srcP3A, srcP3B,
127 srcM1A, srcM1B, srcM2A, srcM2B,
128 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
129 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
130 psumA, psumB, sumA, sumB;
/* lvsl permute vectors for the six tap offsets -2..+3 around src. */
135 permM2 = vec_lvsl(-2, src);
136 permM1 = vec_lvsl(-1, src);
137 permP0 = vec_lvsl(+0, src);
138 permP1 = vec_lvsl(+1, src);
139 permP2 = vec_lvsl(+2, src);
140 permP3 = vec_lvsl(+3, src);
143 for (i = 0 ; i < 16 ; i ++) {
144 load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);
/* Pair the taps symmetrically: (p0+p1), (m1+p2), (m2+p3), on both halves. */
161 sum1A = vec_adds(srcP0A, srcP1A);
162 sum1B = vec_adds(srcP0B, srcP1B);
163 sum2A = vec_adds(srcM1A, srcP2A);
164 sum2B = vec_adds(srcM1B, srcP2B);
165 sum3A = vec_adds(srcM2A, srcP3A);
166 sum3B = vec_adds(srcM2B, srcP3B);
/* 20*(p0+p1) + 16: the rounding bias is folded into the multiply-add. */
168 pp1A = vec_mladd(sum1A, v20ss, v16ss);
169 pp1B = vec_mladd(sum1B, v20ss, v16ss);
/* 5*(m1+p2). */
171 pp2A = vec_mladd(sum2A, v5ss,
zero_s16v);
172 pp2B = vec_mladd(sum2B, v5ss,
zero_s16v);
174 pp3A = vec_add(sum3A, pp1A);
175 pp3B = vec_add(sum3B, pp1B);
177 psumA = vec_sub(pp3A, pp2A);
178 psumB = vec_sub(pp3B, pp2B);
/* Arithmetic >>5, then saturate-pack both 16-bit halves back to bytes. */
180 sumA = vec_sra(psumA, v5us);
181 sumB = vec_sra(psumB, v5us);
183 sum = vec_packsu(sumA, sumB);
/* OP_U8_ALTIVEC -- presumably selects put vs. avg with the old dst; confirm
 * its definition in the including file. */
187 OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));
189 vec_st(fsum, 0, dst);
/*
 * H.264 quarter-pel 16x16 vertical 6-tap lowpass filter (AltiVec).
 * Same kernel as the horizontal version --
 *     clip(((p0+p1)*20 - (m1+p2)*5 + (m2+p3) + 16) >> 5)
 * -- but the six taps are whole rows at srcStride intervals, starting two
 * rows above src (srcbis). The five priming rows are loaded once; each loop
 * iteration loads only the new bottom row (srcP3).
 * NOTE(review): extraction gaps hide the `srcbis += srcStride` advances
 * between the priming loads, the 8->16-bit unpacks into the ssA/ssB halves,
 * the rolling rename of the row registers, and the loop/function closers --
 * confirm against the full file.
 */
198 #ifdef PREFIX_h264_qpel16_v_lowpass_altivec
199 static void PREFIX_h264_qpel16_v_lowpass_altivec(
uint8_t *dst,
201 int dstStride,
int srcStride)
208 perm = vec_lvsl(0, src);
/* Filter constants: 20 (= 5<<2), shift-count 5, 5, rounding term 16 (= 1<<4). */
210 const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
211 const vec_u16 v5us = vec_splat_u16(5);
212 const vec_s16 v5ss = vec_splat_s16(5);
213 const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
/* Start two rows above src: tap m2 for the first output row. */
215 const uint8_t *srcbis = src - (srcStride * 2);
217 const vec_u8 srcM2 = load_with_perm_vec(0, srcbis, perm);
219 const vec_u8 srcM1 = load_with_perm_vec(0, srcbis, perm);
221 const vec_u8 srcP0 = load_with_perm_vec(0, srcbis, perm);
223 const vec_u8 srcP1 = load_with_perm_vec(0, srcbis, perm);
225 const vec_u8 srcP2 = load_with_perm_vec(0, srcbis, perm);
239 vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
240 psumA, psumB, sumA, sumB,
242 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
246 for (i = 0 ; i < 16 ; i++) {
/* Load the newest row; the five older rows carry over from last iteration. */
247 srcP3 = load_with_perm_vec(0, srcbis, perm);
/* Symmetric tap pairs on both 16-bit halves: (p0+p1), (m1+p2), (m2+p3). */
253 sum1A = vec_adds(srcP0ssA, srcP1ssA);
254 sum1B = vec_adds(srcP0ssB, srcP1ssB);
255 sum2A = vec_adds(srcM1ssA, srcP2ssA);
256 sum2B = vec_adds(srcM1ssB, srcP2ssB);
257 sum3A = vec_adds(srcM2ssA, srcP3ssA);
258 sum3B = vec_adds(srcM2ssB, srcP3ssB);
/* 20*(p0+p1) + 16, then subtract 5*(m1+p2), add (m2+p3). */
271 pp1A = vec_mladd(sum1A, v20ss, v16ss);
272 pp1B = vec_mladd(sum1B, v20ss, v16ss);
274 pp2A = vec_mladd(sum2A, v5ss,
zero_s16v);
275 pp2B = vec_mladd(sum2B, v5ss,
zero_s16v);
277 pp3A = vec_add(sum3A, pp1A);
278 pp3B = vec_add(sum3B, pp1B);
280 psumA = vec_sub(pp3A, pp2A);
281 psumB = vec_sub(pp3B, pp2B);
/* Arithmetic >>5 and saturate-pack back to unsigned bytes. */
283 sumA = vec_sra(psumA, v5us);
284 sumB = vec_sra(psumB, v5us);
286 sum = vec_packsu(sumA, sumB);
/* OP_U8_ALTIVEC -- presumably put vs. avg with old dst; confirm definition. */
290 OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));
292 vec_st(fsum, 0, dst);
/*
 * H.264 quarter-pel 16x16 combined horizontal+vertical (center) lowpass
 * filter (AltiVec), computed in two passes:
 *   Pass 1 (21 rows = 16 + 5 border rows): horizontal 6-tap filter; the
 *   intermediate is stored to tmp[] as signed 16-bit WITHOUT rounding or
 *   shifting (note pp1 folds in sum3, not a rounding bias).
 *   Pass 2 (16 rows): vertical 6-tap filter over tmp in 32-bit even/odd
 *   lanes (vec_mule/vec_mulo), rounded with +512 (v512si) and scaled by
 *   >>10 (v10ui), then packed and re-interleaved via mperm.
 * NOTE(review): this function continues past the end of the visible chunk,
 * and extraction gaps hide declarations (i, mperm, sumv, sum, fsum, ssume,
 * ssumo, tmpP2ss*, srcbis-style stride advances, tmpbis advances) and the
 * rolling renames between loop iterations -- confirm against the full file.
 * Code lines are left byte-identical.
 */
300 #ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
301 static void PREFIX_h264_qpel16_hv_lowpass_altivec(
uint8_t *dst, int16_t *tmp,
303 int dstStride,
int tmpStride,
308 vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
/* Constants: 20 (= 5<<2), shift 10, 5, 1, rounding 512 (= 1<<9), shift 16. */
309 const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
310 const vec_u32 v10ui = vec_splat_u32(10);
311 const vec_s16 v5ss = vec_splat_s16(5);
312 const vec_s16 v1ss = vec_splat_s16(1);
313 const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
314 const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
/* Alignment of (src - 2); selects the load_alignment case for pass 1. */
316 register int align = ((((
unsigned long)src) - 2) % 16);
318 vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
319 srcP2A, srcP2B, srcP3A, srcP3B,
320 srcM1A, srcM1B, srcM2A, srcM2B,
321 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
322 pp1A, pp1B, pp2A, pp2B, psumA, psumB;
325 {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
326 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
327 int16_t *tmpbis = tmp;
329 vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
330 tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
333 vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
334 pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
335 pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
336 ssumAe, ssumAo, ssumBe, ssumBo;
/* lvsl permute vectors for the six horizontal tap offsets -2..+3. */
341 permM2 = vec_lvsl(-2, src);
342 permM1 = vec_lvsl(-1, src);
343 permP0 = vec_lvsl(+0, src);
344 permP1 = vec_lvsl(+1, src);
345 permP2 = vec_lvsl(+2, src);
346 permP3 = vec_lvsl(+3, src);
/* Pass 1: start two rows above dst's area; 21 rows feed the vertical pass. */
349 src -= (2 * srcStride);
350 for (i = 0 ; i < 21 ; i ++) {
351 vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
353 load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);
/* Symmetric horizontal tap pairs on both 16-bit halves. */
370 sum1A = vec_adds(srcP0A, srcP1A);
371 sum1B = vec_adds(srcP0B, srcP1B);
372 sum2A = vec_adds(srcM1A, srcP2A);
373 sum2B = vec_adds(srcM1B, srcP2B);
374 sum3A = vec_adds(srcM2A, srcP3A);
375 sum3B = vec_adds(srcM2B, srcP3B);
/* 20*(p0+p1) + (m2+p3): no rounding bias here, unlike the 1-D filters. */
377 pp1A = vec_mladd(sum1A, v20ss, sum3A);
378 pp1B = vec_mladd(sum1B, v20ss, sum3B);
380 pp2A = vec_mladd(sum2A, v5ss,
zero_s16v);
381 pp2B = vec_mladd(sum2B, v5ss,
zero_s16v);
383 psumA = vec_sub(pp1A, pp2A);
384 psumB = vec_sub(pp1B, pp2B);
/* Store the unshifted 16-bit intermediate row (32 bytes) to tmp. */
386 vec_st(psumA, 0, tmp);
387 vec_st(psumB, 16, tmp);
/* Pass 2 priming: load the first five intermediate rows from tmpbis. */
393 tmpM2ssA = vec_ld(0, tmpbis);
394 tmpM2ssB = vec_ld(16, tmpbis);
396 tmpM1ssA = vec_ld(0, tmpbis);
397 tmpM1ssB = vec_ld(16, tmpbis);
399 tmpP0ssA = vec_ld(0, tmpbis);
400 tmpP0ssB = vec_ld(16, tmpbis);
402 tmpP1ssA = vec_ld(0, tmpbis);
403 tmpP1ssB = vec_ld(16, tmpbis);
405 tmpP2ssA = vec_ld(0, tmpbis);
406 tmpP2ssB = vec_ld(16, tmpbis);
409 for (i = 0 ; i < 16 ; i++) {
/* Load only the new bottom intermediate row each iteration. */
410 const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
411 const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);
/* Symmetric vertical tap pairs over the 16-bit intermediates. */
413 const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
414 const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
415 const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
416 const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
417 vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
418 vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
/* Widen to 32 bits via even/odd products: 20*sum1 and 5*sum2 lanes. */
433 pp1Ae = vec_mule(sum1A, v20ss);
434 pp1Ao = vec_mulo(sum1A, v20ss);
435 pp1Be = vec_mule(sum1B, v20ss);
436 pp1Bo = vec_mulo(sum1B, v20ss);
438 pp2Ae = vec_mule(sum2A, v5ss);
439 pp2Ao = vec_mulo(sum2A, v5ss);
440 pp2Be = vec_mule(sum2B, v5ss);
441 pp2Bo = vec_mulo(sum2B, v5ss);
/* sum3 odd lanes widened by *1; even lanes extracted below via element
 * swap + arithmetic >>16 (sign-extending) instead of another multiply. */
443 pp3Ao = vec_mulo(sum3A, v1ss);
444 pp3Bo = vec_mulo(sum3B, v1ss);
446 sum3A = (
vec_s16)vec_perm(sum3A, sum3A,vcswapi2s(0,1,2,3));
447 sum3B = (
vec_s16)vec_perm(sum3B, sum3B,vcswapi2s(0,1,2,3));
449 pp3Ae = vec_sra((
vec_s32)sum3A, v16ui);
450 pp3Be = vec_sra((
vec_s32)sum3B, v16ui);
/* Add the +512 rounding bias to the 20*sum1 terms. */
452 pp1cAe = vec_add(pp1Ae, v512si);
453 pp1cAo = vec_add(pp1Ao, v512si);
454 pp1cBe = vec_add(pp1Be, v512si);
455 pp1cBo = vec_add(pp1Bo, v512si);
/* Combine: (m2+p3) - 5*(m1+p2), then add 20*(p0+p1)+512. */
457 pp32Ae = vec_sub(pp3Ae, pp2Ae);
458 pp32Ao = vec_sub(pp3Ao, pp2Ao);
459 pp32Be = vec_sub(pp3Be, pp2Be);
460 pp32Bo = vec_sub(pp3Bo, pp2Bo);
462 sumAe = vec_add(pp1cAe, pp32Ae);
463 sumAo = vec_add(pp1cAo, pp32Ao);
464 sumBe = vec_add(pp1cBe, pp32Be);
465 sumBo = vec_add(pp1cBo, pp32Bo);
/* Final scale: arithmetic >>10 (5 bits per pass). */
467 ssumAe = vec_sra(sumAe, v10ui);
468 ssumAo = vec_sra(sumAo, v10ui);
469 ssumBe = vec_sra(sumBe, v10ui);
470 ssumBo = vec_sra(sumBo, v10ui);
/* Pack 32->16 with saturation, then 16->8, and re-interleave the even/odd
 * lanes back into pixel order with mperm. */
472 ssume = vec_packs(ssumAe, ssumBe);
473 ssumo = vec_packs(ssumAo, ssumBo);
475 sumv = vec_packsu(ssume, ssumo);
476 sum = vec_perm(sumv, sumv, mperm);
/* OP_U8_ALTIVEC -- presumably put vs. avg with old dst; confirm definition. */
480 OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));
482 vec_st(fsum, 0, dst);