37 const vector
unsigned char zero = (
const vector
unsigned char)vec_splat_u8(0);
38 vector
unsigned char perm1 = vec_lvsl(0, pix2);
39 vector
unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
40 vector
unsigned char pix2l, pix2r;
41 vector
unsigned char pix1v, pix2v, pix2iv, avgv,
t5;
42 vector
unsigned int sad;
43 vector
signed int sumdiffs;
46 sad = (vector
unsigned int)vec_splat_u32(0);
47 for (i = 0; i < h; i++) {
51 pix1v = vec_ld( 0, pix1);
52 pix2l = vec_ld( 0, pix2);
53 pix2r = vec_ld(16, pix2);
54 pix2v = vec_perm(pix2l, pix2r, perm1);
55 pix2iv = vec_perm(pix2l, pix2r, perm2);
58 avgv = vec_avg(pix2v, pix2iv);
61 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
64 sad = vec_sum4s(t5, sad);
70 sumdiffs = vec_sums((vector
signed int) sad, (vector
signed int) zero);
71 sumdiffs = vec_splat(sumdiffs, 3);
72 vec_ste(sumdiffs, 0, &s);
81 const vector
unsigned char zero = (
const vector
unsigned char)vec_splat_u8(0);
82 vector
unsigned char perm = vec_lvsl(0, pix2);
83 vector
unsigned char pix2l, pix2r;
84 vector
unsigned char pix1v, pix2v, pix3v, avgv,
t5;
85 vector
unsigned int sad;
86 vector
signed int sumdiffs;
87 uint8_t *pix3 = pix2 + line_size;
90 sad = (vector
unsigned int)vec_splat_u32(0);
99 pix2l = vec_ld( 0, pix2);
100 pix2r = vec_ld(15, pix2);
101 pix2v = vec_perm(pix2l, pix2r, perm);
103 for (i = 0; i < h; i++) {
107 pix1v = vec_ld(0, pix1);
109 pix2l = vec_ld( 0, pix3);
110 pix2r = vec_ld(15, pix3);
111 pix3v = vec_perm(pix2l, pix2r, perm);
114 avgv = vec_avg(pix2v, pix3v);
117 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
120 sad = vec_sum4s(t5, sad);
129 sumdiffs = vec_sums((vector
signed int) sad, (vector
signed int) zero);
130 sumdiffs = vec_splat(sumdiffs, 3);
131 vec_ste(sumdiffs, 0, &s);
139 uint8_t *pix3 = pix2 + line_size;
140 const vector
unsigned char zero = (
const vector
unsigned char)vec_splat_u8(0);
141 const vector
unsigned short two = (
const vector
unsigned short)vec_splat_u16(2);
142 vector
unsigned char avgv,
t5;
143 vector
unsigned char perm1 = vec_lvsl(0, pix2);
144 vector
unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
145 vector
unsigned char pix2l, pix2r;
146 vector
unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
147 vector
unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
148 vector
unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
149 vector
unsigned short avghv, avglv;
150 vector
unsigned short t1,
t2,
t3,
t4;
151 vector
unsigned int sad;
152 vector
signed int sumdiffs;
154 sad = (vector
unsigned int)vec_splat_u32(0);
165 pix2l = vec_ld( 0, pix2);
166 pix2r = vec_ld(16, pix2);
167 pix2v = vec_perm(pix2l, pix2r, perm1);
168 pix2iv = vec_perm(pix2l, pix2r, perm2);
170 pix2hv = (vector
unsigned short) vec_mergeh(zero, pix2v);
171 pix2lv = (vector
unsigned short) vec_mergel(zero, pix2v);
172 pix2ihv = (vector
unsigned short) vec_mergeh(zero, pix2iv);
173 pix2ilv = (vector
unsigned short) vec_mergel(zero, pix2iv);
174 t1 = vec_add(pix2hv, pix2ihv);
175 t2 = vec_add(pix2lv, pix2ilv);
177 for (i = 0; i < h; i++) {
181 pix1v = vec_ld(0, pix1);
183 pix2l = vec_ld( 0, pix3);
184 pix2r = vec_ld(16, pix3);
185 pix3v = vec_perm(pix2l, pix2r, perm1);
186 pix3iv = vec_perm(pix2l, pix2r, perm2);
195 pix3hv = (vector
unsigned short) vec_mergeh(zero, pix3v);
196 pix3lv = (vector
unsigned short) vec_mergel(zero, pix3v);
197 pix3ihv = (vector
unsigned short) vec_mergeh(zero, pix3iv);
198 pix3ilv = (vector
unsigned short) vec_mergel(zero, pix3iv);
201 t3 = vec_add(pix3hv, pix3ihv);
202 t4 = vec_add(pix3lv, pix3ilv);
204 avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
205 avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);
208 avgv = vec_pack(avghv, avglv);
211 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
214 sad = vec_sum4s(t5, sad);
223 sumdiffs = vec_sums((vector
signed int) sad, (vector
signed int) zero);
224 sumdiffs = vec_splat(sumdiffs, 3);
225 vec_ste(sumdiffs, 0, &s);
234 const vector
unsigned int zero = (
const vector
unsigned int)vec_splat_u32(0);
235 vector
unsigned char perm = vec_lvsl(0, pix2);
237 vector
unsigned int sad;
238 vector
signed int sumdiffs;
240 sad = (vector
unsigned int)vec_splat_u32(0);
243 for (i = 0; i < h; i++) {
245 vector
unsigned char pix2l = vec_ld( 0, pix2);
246 vector
unsigned char pix2r = vec_ld(15, pix2);
247 t1 = vec_ld(0, pix1);
248 t2 = vec_perm(pix2l, pix2r, perm);
251 t3 = vec_max(t1, t2);
252 t4 = vec_min(t1, t2);
253 t5 = vec_sub(t3, t4);
256 sad = vec_sum4s(t5, sad);
263 sumdiffs = vec_sums((vector
signed int) sad, (vector
signed int) zero);
264 sumdiffs = vec_splat(sumdiffs, 3);
265 vec_ste(sumdiffs, 0, &s);
274 const vector
unsigned int zero = (
const vector
unsigned int)vec_splat_u32(0);
275 const vector
unsigned char permclear = (vector
unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};
276 vector
unsigned char perm1 = vec_lvsl(0, pix1);
277 vector
unsigned char perm2 = vec_lvsl(0, pix2);
279 vector
unsigned int sad;
280 vector
signed int sumdiffs;
282 sad = (vector
unsigned int)vec_splat_u32(0);
284 for (i = 0; i < h; i++) {
288 vector
unsigned char pix1l = vec_ld( 0, pix1);
289 vector
unsigned char pix1r = vec_ld(15, pix1);
290 vector
unsigned char pix2l = vec_ld( 0, pix2);
291 vector
unsigned char pix2r = vec_ld(15, pix2);
292 t1 = vec_and(vec_perm(pix1l, pix1r, perm1), permclear);
293 t2 = vec_and(vec_perm(pix2l, pix2r, perm2), permclear);
296 t3 = vec_max(t1, t2);
297 t4 = vec_min(t1, t2);
298 t5 = vec_sub(t3, t4);
301 sad = vec_sum4s(t5, sad);
308 sumdiffs = vec_sums((vector
signed int) sad, (vector
signed int) zero);
309 sumdiffs = vec_splat(sumdiffs, 3);
310 vec_ste(sumdiffs, 0, &s);
319 const vector
unsigned int zero = (
const vector
unsigned int)vec_splat_u32(0);
320 vector
unsigned char perm = vec_lvsl(0, pix);
321 vector
unsigned char pixv;
322 vector
unsigned int sv;
323 vector
signed int sum;
325 sv = (vector
unsigned int)vec_splat_u32(0);
328 for (i = 0; i < 16; i++) {
330 vector
unsigned char pixl = vec_ld( 0, pix);
331 vector
unsigned char pixr = vec_ld(15, pix);
332 pixv = vec_perm(pixl, pixr, perm);
335 sv = vec_msum(pixv, pixv, sv);
340 sum = vec_sums((vector
signed int) sv, (vector
signed int) zero);
341 sum = vec_splat(sum, 3);
356 const vector
unsigned int zero = (
const vector
unsigned int)vec_splat_u32(0);
357 const vector
unsigned char permclear = (vector
unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};
358 vector
unsigned char perm1 = vec_lvsl(0, pix1);
359 vector
unsigned char perm2 = vec_lvsl(0, pix2);
361 vector
unsigned int sum;
362 vector
signed int sumsqr;
364 sum = (vector
unsigned int)vec_splat_u32(0);
366 for (i = 0; i < h; i++) {
370 vector
unsigned char pix1l = vec_ld( 0, pix1);
371 vector
unsigned char pix1r = vec_ld(15, pix1);
372 vector
unsigned char pix2l = vec_ld( 0, pix2);
373 vector
unsigned char pix2r = vec_ld(15, pix2);
374 t1 = vec_and(vec_perm(pix1l, pix1r, perm1), permclear);
375 t2 = vec_and(vec_perm(pix2l, pix2r, perm2), permclear);
381 t3 = vec_max(t1, t2);
382 t4 = vec_min(t1, t2);
383 t5 = vec_sub(t3, t4);
386 sum = vec_msum(t5, t5, sum);
393 sumsqr = vec_sums((vector
signed int) sum, (vector
signed int) zero);
394 sumsqr = vec_splat(sumsqr, 3);
395 vec_ste(sumsqr, 0, &s);
409 const vector
unsigned int zero = (
const vector
unsigned int)vec_splat_u32(0);
410 vector
unsigned char perm = vec_lvsl(0, pix2);
412 vector
unsigned int sum;
413 vector
signed int sumsqr;
415 sum = (vector
unsigned int)vec_splat_u32(0);
417 for (i = 0; i < h; i++) {
419 vector
unsigned char pix2l = vec_ld( 0, pix2);
420 vector
unsigned char pix2r = vec_ld(15, pix2);
421 t1 = vec_ld(0, pix1);
422 t2 = vec_perm(pix2l, pix2r, perm);
428 t3 = vec_max(t1, t2);
429 t4 = vec_min(t1, t2);
430 t5 = vec_sub(t3, t4);
433 sum = vec_msum(t5, t5, sum);
440 sumsqr = vec_sums((vector
signed int) sum, (vector
signed int) zero);
441 sumsqr = vec_splat(sumsqr, 3);
442 vec_ste(sumsqr, 0, &s);
449 const vector
unsigned int zero = (
const vector
unsigned int)vec_splat_u32(0);
450 vector
unsigned char perm = vec_lvsl(0, pix);
451 vector
unsigned char t1;
452 vector
unsigned int sad;
453 vector
signed int sumdiffs;
458 sad = (vector
unsigned int)vec_splat_u32(0);
460 for (i = 0; i < 16; i++) {
462 vector
unsigned char pixl = vec_ld( 0, pix);
463 vector
unsigned char pixr = vec_ld(15, pix);
464 t1 = vec_perm(pixl, pixr, perm);
467 sad = vec_sum4s(t1, sad);
473 sumdiffs = vec_sums((vector
signed int) sad, (vector
signed int) zero);
474 sumdiffs = vec_splat(sumdiffs, 3);
475 vec_ste(sumdiffs, 0, &s);
483 vector
unsigned char perm = vec_lvsl(0, pixels);
484 vector
unsigned char bytes;
485 const vector
unsigned char zero = (
const vector
unsigned char)vec_splat_u8(0);
486 vector
signed short shorts;
488 for (i = 0; i < 8; i++) {
492 vector
unsigned char pixl = vec_ld( 0, pixels);
493 vector
unsigned char pixr = vec_ld(15, pixels);
494 bytes = vec_perm(pixl, pixr, perm);
497 shorts = (vector
signed short)vec_mergeh(zero, bytes);
500 vec_st(shorts, i*16, (vector
signed short*)block);
510 vector
unsigned char perm1 = vec_lvsl(0, s1);
511 vector
unsigned char perm2 = vec_lvsl(0, s2);
512 vector
unsigned char bytes, pixl, pixr;
513 const vector
unsigned char zero = (
const vector
unsigned char)vec_splat_u8(0);
514 vector
signed short shorts1, shorts2;
516 for (i = 0; i < 4; i++) {
520 pixl = vec_ld( 0, s1);
521 pixr = vec_ld(15, s1);
522 bytes = vec_perm(pixl, pixr, perm1);
525 shorts1 = (vector
signed short)vec_mergeh(zero, bytes);
528 pixl = vec_ld( 0, s2);
529 pixr = vec_ld(15, s2);
530 bytes = vec_perm(pixl, pixr, perm2);
533 shorts2 = (vector
signed short)vec_mergeh(zero, bytes);
536 shorts1 = vec_sub(shorts1, shorts2);
539 vec_st(shorts1, 0, (vector
signed short*)block);
552 pixl = vec_ld( 0, s1);
553 pixr = vec_ld(15, s1);
554 bytes = vec_perm(pixl, pixr, perm1);
557 shorts1 = (vector
signed short)vec_mergeh(zero, bytes);
560 pixl = vec_ld( 0, s2);
561 pixr = vec_ld(15, s2);
562 bytes = vec_perm(pixl, pixr, perm2);
565 shorts2 = (vector
signed short)vec_mergeh(zero, bytes);
568 shorts1 = vec_sub(shorts1, shorts2);
571 vec_st(shorts1, 0, (vector
signed short*)block);
595 register vector
unsigned char vdst, vsrc;
598 for (i = 0 ; (i + 15) < w ; i+=16) {
599 vdst = vec_ld(i, (
unsigned char*)dst);
600 vsrc = vec_ld(i, (
unsigned char*)src);
601 vdst = vec_add(vsrc, vdst);
602 vec_st(vdst, i, (
unsigned char*)dst);
605 for (; (i < w) ; i++) {
612 register const vector
unsigned char vzero =
613 (
const vector
unsigned char)vec_splat_u8(0);
614 register vector
signed short temp0, temp1, temp2, temp3, temp4,
617 register const vector
signed short vprod1 =(
const vector
signed short)
618 { 1,-1, 1,-1, 1,-1, 1,-1 };
619 register const vector
signed short vprod2 =(
const vector
signed short)
620 { 1, 1,-1,-1, 1, 1,-1,-1 };
621 register const vector
signed short vprod3 =(
const vector
signed short)
622 { 1, 1, 1, 1,-1,-1,-1,-1 };
623 register const vector
unsigned char perm1 = (
const vector
unsigned char)
624 {0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
625 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D};
626 register const vector
unsigned char perm2 = (
const vector
unsigned char)
627 {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
628 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B};
629 register const vector
unsigned char perm3 = (
const vector
unsigned char)
630 {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
631 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};
633 #define ONEITERBUTTERFLY(i, res) \
635 register vector unsigned char src1, src2, srcO; \
636 register vector unsigned char dst1, dst2, dstO; \
637 register vector signed short srcV, dstV; \
638 register vector signed short but0, but1, but2, op1, op2, op3; \
639 src1 = vec_ld(stride * i, src); \
640 src2 = vec_ld((stride * i) + 15, src); \
641 srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
642 dst1 = vec_ld(stride * i, dst); \
643 dst2 = vec_ld((stride * i) + 15, dst); \
644 dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
647 srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \
648 (vector signed char)srcO); \
649 dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \
650 (vector signed char)dstO); \
652 but0 = vec_sub(srcV, dstV); \
653 op1 = vec_perm(but0, but0, perm1); \
654 but1 = vec_mladd(but0, vprod1, op1); \
655 op2 = vec_perm(but1, but1, perm2); \
656 but2 = vec_mladd(but1, vprod2, op2); \
657 op3 = vec_perm(but2, but2, perm3); \
658 res = vec_mladd(but2, vprod3, op3); \
669 #undef ONEITERBUTTERFLY
671 register vector
signed int vsum;
672 register vector
signed short line0 = vec_add(temp0, temp1);
673 register vector
signed short line1 = vec_sub(temp0, temp1);
674 register vector
signed short line2 = vec_add(temp2, temp3);
675 register vector
signed short line3 = vec_sub(temp2, temp3);
676 register vector
signed short line4 = vec_add(temp4, temp5);
677 register vector
signed short line5 = vec_sub(temp4, temp5);
678 register vector
signed short line6 = vec_add(temp6, temp7);
679 register vector
signed short line7 = vec_sub(temp6, temp7);
681 register vector
signed short line0B = vec_add(line0, line2);
682 register vector
signed short line2B = vec_sub(line0, line2);
683 register vector
signed short line1B = vec_add(line1, line3);
684 register vector
signed short line3B = vec_sub(line1, line3);
685 register vector
signed short line4B = vec_add(line4, line6);
686 register vector
signed short line6B = vec_sub(line4, line6);
687 register vector
signed short line5B = vec_add(line5, line7);
688 register vector
signed short line7B = vec_sub(line5, line7);
690 register vector
signed short line0C = vec_add(line0B, line4B);
691 register vector
signed short line4C = vec_sub(line0B, line4B);
692 register vector
signed short line1C = vec_add(line1B, line5B);
693 register vector
signed short line5C = vec_sub(line1B, line5B);
694 register vector
signed short line2C = vec_add(line2B, line6B);
695 register vector
signed short line6C = vec_sub(line2B, line6B);
696 register vector
signed short line3C = vec_add(line3B, line7B);
697 register vector
signed short line7C = vec_sub(line3B, line7B);
699 vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
700 vsum = vec_sum4s(vec_abs(line1C), vsum);
701 vsum = vec_sum4s(vec_abs(line2C), vsum);
702 vsum = vec_sum4s(vec_abs(line3C), vsum);
703 vsum = vec_sum4s(vec_abs(line4C), vsum);
704 vsum = vec_sum4s(vec_abs(line5C), vsum);
705 vsum = vec_sum4s(vec_abs(line6C), vsum);
706 vsum = vec_sum4s(vec_abs(line7C), vsum);
707 vsum = vec_sums(vsum, (vector
signed int)vzero);
708 vsum = vec_splat(vsum, 3);
709 vec_ste(vsum, 0, &sum);
735 register vector
signed short
736 temp0 __asm__ (
"v0"),
737 temp1 __asm__ (
"v1"),
738 temp2 __asm__ (
"v2"),
739 temp3 __asm__ (
"v3"),
740 temp4 __asm__ (
"v4"),
741 temp5 __asm__ (
"v5"),
742 temp6 __asm__ (
"v6"),
743 temp7 __asm__ (
"v7");
744 register vector
signed short
745 temp0S __asm__ (
"v8"),
746 temp1S __asm__ (
"v9"),
747 temp2S __asm__ (
"v10"),
748 temp3S __asm__ (
"v11"),
749 temp4S __asm__ (
"v12"),
750 temp5S __asm__ (
"v13"),
751 temp6S __asm__ (
"v14"),
752 temp7S __asm__ (
"v15");
753 register const vector
unsigned char vzero __asm__ (
"v31") =
754 (
const vector
unsigned char)vec_splat_u8(0);
756 register const vector
signed short vprod1 __asm__ (
"v16") =
757 (
const vector
signed short){ 1,-1, 1,-1, 1,-1, 1,-1 };
758 register const vector
signed short vprod2 __asm__ (
"v17") =
759 (
const vector
signed short){ 1, 1,-1,-1, 1, 1,-1,-1 };
760 register const vector
signed short vprod3 __asm__ (
"v18") =
761 (
const vector
signed short){ 1, 1, 1, 1,-1,-1,-1,-1 };
762 register const vector
unsigned char perm1 __asm__ (
"v19") =
763 (
const vector
unsigned char)
764 {0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
765 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D};
766 register const vector
unsigned char perm2 __asm__ (
"v20") =
767 (
const vector
unsigned char)
768 {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
769 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B};
770 register const vector
unsigned char perm3 __asm__ (
"v21") =
771 (
const vector
unsigned char)
772 {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
773 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};
775 #define ONEITERBUTTERFLY(i, res1, res2) \
777 register vector unsigned char src1 __asm__ ("v22"), \
778 src2 __asm__ ("v23"), \
779 dst1 __asm__ ("v24"), \
780 dst2 __asm__ ("v25"), \
781 srcO __asm__ ("v22"), \
782 dstO __asm__ ("v23"); \
784 register vector signed short srcV __asm__ ("v24"), \
785 dstV __asm__ ("v25"), \
786 srcW __asm__ ("v26"), \
787 dstW __asm__ ("v27"), \
788 but0 __asm__ ("v28"), \
789 but0S __asm__ ("v29"), \
790 op1 __asm__ ("v30"), \
791 but1 __asm__ ("v22"), \
792 op1S __asm__ ("v23"), \
793 but1S __asm__ ("v24"), \
794 op2 __asm__ ("v25"), \
795 but2 __asm__ ("v26"), \
796 op2S __asm__ ("v27"), \
797 but2S __asm__ ("v28"), \
798 op3 __asm__ ("v29"), \
799 op3S __asm__ ("v30"); \
801 src1 = vec_ld(stride * i, src); \
802 src2 = vec_ld((stride * i) + 16, src); \
803 srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
804 dst1 = vec_ld(stride * i, dst); \
805 dst2 = vec_ld((stride * i) + 16, dst); \
806 dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
808 srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \
809 (vector signed char)srcO); \
810 dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \
811 (vector signed char)dstO); \
812 srcW = (vector signed short)vec_mergel((vector signed char)vzero, \
813 (vector signed char)srcO); \
814 dstW = (vector signed short)vec_mergel((vector signed char)vzero, \
815 (vector signed char)dstO); \
817 but0 = vec_sub(srcV, dstV); \
818 but0S = vec_sub(srcW, dstW); \
819 op1 = vec_perm(but0, but0, perm1); \
820 but1 = vec_mladd(but0, vprod1, op1); \
821 op1S = vec_perm(but0S, but0S, perm1); \
822 but1S = vec_mladd(but0S, vprod1, op1S); \
823 op2 = vec_perm(but1, but1, perm2); \
824 but2 = vec_mladd(but1, vprod2, op2); \
825 op2S = vec_perm(but1S, but1S, perm2); \
826 but2S = vec_mladd(but1S, vprod2, op2S); \
827 op3 = vec_perm(but2, but2, perm3); \
828 res1 = vec_mladd(but2, vprod3, op3); \
829 op3S = vec_perm(but2S, but2S, perm3); \
830 res2 = vec_mladd(but2S, vprod3, op3S); \
841 #undef ONEITERBUTTERFLY
843 register vector
signed int vsum;
844 register vector
signed short line0S, line1S, line2S, line3S, line4S,
845 line5S, line6S, line7S, line0BS,line2BS,
846 line1BS,line3BS,line4BS,line6BS,line5BS,
847 line7BS,line0CS,line4CS,line1CS,line5CS,
848 line2CS,line6CS,line3CS,line7CS;
850 register vector
signed short line0 = vec_add(temp0, temp1);
851 register vector
signed short line1 = vec_sub(temp0, temp1);
852 register vector
signed short line2 = vec_add(temp2, temp3);
853 register vector
signed short line3 = vec_sub(temp2, temp3);
854 register vector
signed short line4 = vec_add(temp4, temp5);
855 register vector
signed short line5 = vec_sub(temp4, temp5);
856 register vector
signed short line6 = vec_add(temp6, temp7);
857 register vector
signed short line7 = vec_sub(temp6, temp7);
859 register vector
signed short line0B = vec_add(line0, line2);
860 register vector
signed short line2B = vec_sub(line0, line2);
861 register vector
signed short line1B = vec_add(line1, line3);
862 register vector
signed short line3B = vec_sub(line1, line3);
863 register vector
signed short line4B = vec_add(line4, line6);
864 register vector
signed short line6B = vec_sub(line4, line6);
865 register vector
signed short line5B = vec_add(line5, line7);
866 register vector
signed short line7B = vec_sub(line5, line7);
868 register vector
signed short line0C = vec_add(line0B, line4B);
869 register vector
signed short line4C = vec_sub(line0B, line4B);
870 register vector
signed short line1C = vec_add(line1B, line5B);
871 register vector
signed short line5C = vec_sub(line1B, line5B);
872 register vector
signed short line2C = vec_add(line2B, line6B);
873 register vector
signed short line6C = vec_sub(line2B, line6B);
874 register vector
signed short line3C = vec_add(line3B, line7B);
875 register vector
signed short line7C = vec_sub(line3B, line7B);
877 vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
878 vsum = vec_sum4s(vec_abs(line1C), vsum);
879 vsum = vec_sum4s(vec_abs(line2C), vsum);
880 vsum = vec_sum4s(vec_abs(line3C), vsum);
881 vsum = vec_sum4s(vec_abs(line4C), vsum);
882 vsum = vec_sum4s(vec_abs(line5C), vsum);
883 vsum = vec_sum4s(vec_abs(line6C), vsum);
884 vsum = vec_sum4s(vec_abs(line7C), vsum);
886 line0S = vec_add(temp0S, temp1S);
887 line1S = vec_sub(temp0S, temp1S);
888 line2S = vec_add(temp2S, temp3S);
889 line3S = vec_sub(temp2S, temp3S);
890 line4S = vec_add(temp4S, temp5S);
891 line5S = vec_sub(temp4S, temp5S);
892 line6S = vec_add(temp6S, temp7S);
893 line7S = vec_sub(temp6S, temp7S);
895 line0BS = vec_add(line0S, line2S);
896 line2BS = vec_sub(line0S, line2S);
897 line1BS = vec_add(line1S, line3S);
898 line3BS = vec_sub(line1S, line3S);
899 line4BS = vec_add(line4S, line6S);
900 line6BS = vec_sub(line4S, line6S);
901 line5BS = vec_add(line5S, line7S);
902 line7BS = vec_sub(line5S, line7S);
904 line0CS = vec_add(line0BS, line4BS);
905 line4CS = vec_sub(line0BS, line4BS);
906 line1CS = vec_add(line1BS, line5BS);
907 line5CS = vec_sub(line1BS, line5BS);
908 line2CS = vec_add(line2BS, line6BS);
909 line6CS = vec_sub(line2BS, line6BS);
910 line3CS = vec_add(line3BS, line7BS);
911 line7CS = vec_sub(line3BS, line7BS);
913 vsum = vec_sum4s(vec_abs(line0CS), vsum);
914 vsum = vec_sum4s(vec_abs(line1CS), vsum);
915 vsum = vec_sum4s(vec_abs(line2CS), vsum);
916 vsum = vec_sum4s(vec_abs(line3CS), vsum);
917 vsum = vec_sum4s(vec_abs(line4CS), vsum);
918 vsum = vec_sum4s(vec_abs(line5CS), vsum);
919 vsum = vec_sum4s(vec_abs(line6CS), vsum);
920 vsum = vec_sum4s(vec_abs(line7CS), vsum);
921 vsum = vec_sums(vsum, (vector
signed int)vzero);
922 vsum = vec_splat(vsum, 3);
923 vec_ste(vsum, 0, &sum);
956 if (!high_bit_depth) {