#ifdef DEBUG_ALIGNMENT
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr & 0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif
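/* Bilinear chroma interpolation core for one row of 8 output pixels:
 * widen the two new source rows to 16 bits, accumulate
 * A*src0 + B*src1 + C*src2 + D*src3 on top of the BIAS1 rounding term
 * (BIAS2 hooks in the extra VC-1 "no rounding" offset), shift right by 6,
 * then merge the packed bytes into the existing destination vector via
 * fperm and store 16 aligned bytes through OP_U8_ALTIVEC (the store or
 * average operation supplied by the including file). */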
#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = BIAS2(psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
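/* Simplified core used when the interpolation is one-dimensional
 * (x == 0 or y == 0): only two source vectors contribute, weighted by
 * vA and vE (= vB + vC, one of which is zero), with the usual +32
 * rounding and shift by 6. */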
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
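/* Bias used by the VC-1 "no rounding" variant: 28 = 32 - 4 (v28ss),
 * plugged in as the BIAS2 hook of CHROMA_MC8_ALTIVEC_CORE below. */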
#define add28(a) vec_add(v28ss, a)
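/* 8xh H.264 chroma motion compensation.  The bilinear weights
 * A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y and D = x*y are splatted
 * into vA..vD; source rows are read with vec_lvsl/vec_perm so that
 * unaligned inputs work, with separate paths depending on whether a
 * second aligned load is needed for a row (loadSecond) and on the
 * worst-case source offset of 15 (reallyBadAlign). */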
#ifdef PREFIX_h264_chroma_mc8_altivec
                                           int stride, int h, int x, int y) {
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;
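    /* fperm routes the 8 freshly computed pixels into either the low or
     * the high half of the aligned 16-byte vector that is stored back,
     * depending on the alignment of dst, so the neighbouring 8 bytes of
     * the destination are preserved. */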
    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }
    vsrcAuc = vec_ld(0, src);
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    for (i = 0 ; i < h ; i++) {
        vsrcCuc = vec_ld(stride + 0, src);
        vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
        vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

    for (i = 0 ; i < h ; i++) {
        vsrcCuc = vec_ld(stride + 0, src);
        vsrcDuc = vec_ld(stride + 16, src);
        vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

    const vec_s16 vE = vec_add(vB, vC);

    for (i = 0 ; i < h ; i++) {
        vsrcCuc = vec_ld(stride + 0, src);
        vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);

    for (i = 0 ; i < h ; i++) {
        vsrcCuc = vec_ld(stride + 0, src);
        vsrcDuc = vec_ld(stride + 15, src);
        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);

    for (i = 0 ; i < h ; i++) {
        vsrcCuc = vec_ld(0, src);
        vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
        vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

    for (i = 0 ; i < h ; i++) {
        vsrcCuc = vec_ld(0, src);
        vsrcDuc = vec_ld(15, src);
        vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
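/* VC-1 variant of the 8xh chroma MC: same structure as the H.264
 * function above, but with the "no rounding" bias of 28 (v28ss, applied
 * through add28) instead of 32. */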
#ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec
                        {((8 - x) * (8 - y)),
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;
    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }
    vsrcAuc = vec_ld(0, src);
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    for (i = 0 ; i < h ; i++) {
        vsrcCuc = vec_ld(stride + 0, src);
        vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
        vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

    for (i = 0 ; i < h ; i++) {
        vsrcCuc = vec_ld(stride + 0, src);
        vsrcDuc = vec_ld(stride + 16, src);
        vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

#undef CHROMA_MC8_ALTIVEC_CORE
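/* 16x16 horizontal qpel lowpass: each output pixel is the 6-tap filter
 * (1, -5, 20, 20, -5, 1) applied across src[-2..+3], plus 16, shifted
 * right by 5 and saturated to 8 bits.  The 16 pixels of a row are
 * processed as two 8-element halves (the A and B vectors). */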
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;
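    /* Each iteration below assembles the six shifted source vectors
     * srcM2..srcP3 from two aligned 16-byte loads (plus a third load at
     * offset 30, srcR3, for the less favourable alignments); the value
     * of 'align' decides which combination of vec_perm operations is
     * used. */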
    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP1 = vec_perm(srcR1, srcR2, permP1);
        srcP2 = vec_perm(srcR1, srcR2, permP2);
        srcP3 = vec_perm(srcR1, srcR2, permP3);

        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP1 = vec_perm(srcR1, srcR2, permP1);
        srcP2 = vec_perm(srcR1, srcR2, permP2);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP1 = vec_perm(srcR1, srcR2, permP1);
        srcP3 = vec_perm(srcR2, srcR3, permP3);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP2 = vec_perm(srcR2, srcR3, permP2);
        srcP3 = vec_perm(srcR2, srcR3, permP3);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP1 = vec_perm(srcR2, srcR3, permP1);
        srcP2 = vec_perm(srcR2, srcR3, permP2);
        srcP3 = vec_perm(srcR2, srcR3, permP3);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcP0 = vec_perm(srcR2, srcR3, permP0);
        srcP1 = vec_perm(srcR2, srcR3, permP1);
        srcP2 = vec_perm(srcR2, srcR3, permP2);
        srcP3 = vec_perm(srcR2, srcR3, permP3);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        vec_st(fsum, 0, dst);
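/* 16x16 vertical qpel lowpass: the same 6-tap (1, -5, 20, 20, -5, 1)
 * filter, +16 and >>5, applied down each column.  Two 16-byte loads per
 * row plus vec_perm handle unaligned sources. */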
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);

    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);

    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);

    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);

    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
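    /* The first five rows of the six-row filter window (M2..P2) are
     * loaded once above; the loop below only has to fetch the new bottom
     * row (srcP3) for every output row. */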
    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, fsum, srcP3a, srcP3b, srcP3;

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        vec_st(fsum, 0, dst);
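/* 16x16 2-D (horizontal + vertical) qpel lowpass.  Pass 1 applies the
 * horizontal 6-tap filter and stores the unscaled 16-bit results in the
 * intermediate buffer tmp; pass 2 runs the vertical 6-tap filter over
 * tmp in 32-bit precision, adds the combined rounding bias of 512 and
 * shifts right by 10 before packing back to bytes. */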
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
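    /* Pass 1: horizontal filter on 21 source rows (the 16 output rows
     * plus the two above and three below needed by the vertical filter),
     * written as raw 16-bit sums into tmp. */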
    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP1 = vec_perm(srcR1, srcR2, permP1);
        srcP2 = vec_perm(srcR1, srcR2, permP2);
        srcP3 = vec_perm(srcR1, srcR2, permP3);

        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP1 = vec_perm(srcR1, srcR2, permP1);
        srcP2 = vec_perm(srcR1, srcR2, permP2);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP1 = vec_perm(srcR1, srcR2, permP1);
        srcP3 = vec_perm(srcR2, srcR3, permP3);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP2 = vec_perm(srcR2, srcR3, permP2);
        srcP3 = vec_perm(srcR2, srcR3, permP3);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP1 = vec_perm(srcR2, srcR3, permP1);
        srcP2 = vec_perm(srcR2, srcR3, permP2);
        srcP3 = vec_perm(srcR2, srcR3, permP3);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcP0 = vec_perm(srcR2, srcR3, permP0);
        srcP1 = vec_perm(srcR2, srcR3, permP1);
        srcP2 = vec_perm(srcR2, srcR3, permP2);
        srcP3 = vec_perm(srcR2, srcR3, permP3);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);
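    /* Pass 2: vertical filter over the 16-bit rows in tmp.  The products
     * are formed with vec_mule/vec_mulo in 32-bit even/odd lanes, the
     * +512 bias is added, the result is shifted right by 10, packed back
     * to 16 and then 8 bits, and re-interleaved with mperm before the
     * store. */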
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        vec_st(fsum, 0, dst);