37 const vector
unsigned char zero = (
const vector
unsigned char)vec_splat_u8(0);
38 vector
unsigned char perm1 = vec_lvsl(0, pix2);
39 vector
unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
40 vector
unsigned char pix2l, pix2r;
41 vector
unsigned char pix1v, pix2v, pix2iv, avgv,
t5;
42 vector
unsigned int sad;
43 vector
signed int sumdiffs;
46 sad = (vector
unsigned int)vec_splat_u32(0);
47 for (i = 0; i < h; i++) {
51 pix1v = vec_ld( 0, pix1);
52 pix2l = vec_ld( 0, pix2);
53 pix2r = vec_ld(16, pix2);
54 pix2v = vec_perm(pix2l, pix2r, perm1);
55 pix2iv = vec_perm(pix2l, pix2r, perm2);
58 avgv = vec_avg(pix2v, pix2iv);
61 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
64 sad = vec_sum4s(t5, sad);
70 sumdiffs = vec_sums((vector
signed int) sad, (vector
signed int) zero);
71 sumdiffs = vec_splat(sumdiffs, 3);
72 vec_ste(sumdiffs, 0, &s);
81 const vector
unsigned char zero = (
const vector
unsigned char)vec_splat_u8(0);
82 vector
unsigned char perm = vec_lvsl(0, pix2);
83 vector
unsigned char pix2l, pix2r;
84 vector
unsigned char pix1v, pix2v, pix3v, avgv,
t5;
85 vector
unsigned int sad;
86 vector
signed int sumdiffs;
87 uint8_t *pix3 = pix2 + line_size;
90 sad = (vector
unsigned int)vec_splat_u32(0);
99 pix2l = vec_ld( 0, pix2);
100 pix2r = vec_ld(15, pix2);
101 pix2v = vec_perm(pix2l, pix2r, perm);
103 for (i = 0; i < h; i++) {
107 pix1v = vec_ld(0, pix1);
109 pix2l = vec_ld( 0, pix3);
110 pix2r = vec_ld(15, pix3);
111 pix3v = vec_perm(pix2l, pix2r, perm);
114 avgv = vec_avg(pix2v, pix3v);
117 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
120 sad = vec_sum4s(t5, sad);
129 sumdiffs = vec_sums((vector
signed int) sad, (vector
signed int) zero);
130 sumdiffs = vec_splat(sumdiffs, 3);
131 vec_ste(sumdiffs, 0, &s);
139 uint8_t *pix3 = pix2 + line_size;
140 const vector
unsigned char zero = (
const vector
unsigned char)vec_splat_u8(0);
141 const vector
unsigned short two = (
const vector
unsigned short)vec_splat_u16(2);
142 vector
unsigned char avgv,
t5;
143 vector
unsigned char perm1 = vec_lvsl(0, pix2);
144 vector
unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
145 vector
unsigned char pix2l, pix2r;
146 vector
unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
147 vector
unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
148 vector
unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
149 vector
unsigned short avghv, avglv;
150 vector
unsigned short t1,
t2,
t3,
t4;
151 vector
unsigned int sad;
152 vector
signed int sumdiffs;
154 sad = (vector
unsigned int)vec_splat_u32(0);
165 pix2l = vec_ld( 0, pix2);
166 pix2r = vec_ld(16, pix2);
167 pix2v = vec_perm(pix2l, pix2r, perm1);
168 pix2iv = vec_perm(pix2l, pix2r, perm2);
170 pix2hv = (vector
unsigned short) vec_mergeh(zero, pix2v);
171 pix2lv = (vector
unsigned short) vec_mergel(zero, pix2v);
172 pix2ihv = (vector
unsigned short) vec_mergeh(zero, pix2iv);
173 pix2ilv = (vector
unsigned short) vec_mergel(zero, pix2iv);
174 t1 = vec_add(pix2hv, pix2ihv);
175 t2 = vec_add(pix2lv, pix2ilv);
177 for (i = 0; i < h; i++) {
181 pix1v = vec_ld(0, pix1);
183 pix2l = vec_ld( 0, pix3);
184 pix2r = vec_ld(16, pix3);
185 pix3v = vec_perm(pix2l, pix2r, perm1);
186 pix3iv = vec_perm(pix2l, pix2r, perm2);
195 pix3hv = (vector
unsigned short) vec_mergeh(zero, pix3v);
196 pix3lv = (vector
unsigned short) vec_mergel(zero, pix3v);
197 pix3ihv = (vector
unsigned short) vec_mergeh(zero, pix3iv);
198 pix3ilv = (vector
unsigned short) vec_mergel(zero, pix3iv);
201 t3 = vec_add(pix3hv, pix3ihv);
202 t4 = vec_add(pix3lv, pix3ilv);
204 avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
205 avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);
208 avgv = vec_pack(avghv, avglv);
211 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
214 sad = vec_sum4s(t5, sad);
223 sumdiffs = vec_sums((vector
signed int) sad, (vector
signed int) zero);
224 sumdiffs = vec_splat(sumdiffs, 3);
225 vec_ste(sumdiffs, 0, &s);
234 const vector
unsigned int zero = (
const vector
unsigned int)vec_splat_u32(0);
235 vector
unsigned char perm = vec_lvsl(0, pix2);
237 vector
unsigned int sad;
238 vector
signed int sumdiffs;
240 sad = (vector
unsigned int)vec_splat_u32(0);
243 for (i = 0; i < h; i++) {
245 vector
unsigned char pix2l = vec_ld( 0, pix2);
246 vector
unsigned char pix2r = vec_ld(15, pix2);
247 t1 = vec_ld(0, pix1);
248 t2 = vec_perm(pix2l, pix2r, perm);
251 t3 = vec_max(t1, t2);
252 t4 = vec_min(t1, t2);
253 t5 = vec_sub(t3, t4);
256 sad = vec_sum4s(t5, sad);
263 sumdiffs = vec_sums((vector
signed int) sad, (vector
signed int) zero);
264 sumdiffs = vec_splat(sumdiffs, 3);
265 vec_ste(sumdiffs, 0, &s);
274 const vector
unsigned int zero = (
const vector
unsigned int)vec_splat_u32(0);
275 const vector
unsigned char permclear = (vector
unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};
276 vector
unsigned char perm1 = vec_lvsl(0, pix1);
277 vector
unsigned char perm2 = vec_lvsl(0, pix2);
279 vector
unsigned int sad;
280 vector
signed int sumdiffs;
282 sad = (vector
unsigned int)vec_splat_u32(0);
284 for (i = 0; i < h; i++) {
288 vector
unsigned char pix1l = vec_ld( 0, pix1);
289 vector
unsigned char pix1r = vec_ld(15, pix1);
290 vector
unsigned char pix2l = vec_ld( 0, pix2);
291 vector
unsigned char pix2r = vec_ld(15, pix2);
292 t1 = vec_and(vec_perm(pix1l, pix1r, perm1), permclear);
293 t2 = vec_and(vec_perm(pix2l, pix2r, perm2), permclear);
296 t3 = vec_max(t1, t2);
297 t4 = vec_min(t1, t2);
298 t5 = vec_sub(t3, t4);
301 sad = vec_sum4s(t5, sad);
308 sumdiffs = vec_sums((vector
signed int) sad, (vector
signed int) zero);
309 sumdiffs = vec_splat(sumdiffs, 3);
310 vec_ste(sumdiffs, 0, &s);
319 const vector
unsigned int zero = (
const vector
unsigned int)vec_splat_u32(0);
320 vector
unsigned char perm = vec_lvsl(0, pix);
321 vector
unsigned char pixv;
322 vector
unsigned int sv;
323 vector
signed int sum;
325 sv = (vector
unsigned int)vec_splat_u32(0);
328 for (i = 0; i < 16; i++) {
330 vector
unsigned char pixl = vec_ld( 0, pix);
331 vector
unsigned char pixr = vec_ld(15, pix);
332 pixv = vec_perm(pixl, pixr, perm);
335 sv = vec_msum(pixv, pixv, sv);
340 sum = vec_sums((vector
signed int) sv, (vector
signed int) zero);
341 sum = vec_splat(sum, 3);
356 const vector
unsigned int zero = (
const vector
unsigned int)vec_splat_u32(0);
357 const vector
unsigned char permclear = (vector
unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};
358 vector
unsigned char perm1 = vec_lvsl(0, pix1);
359 vector
unsigned char perm2 = vec_lvsl(0, pix2);
361 vector
unsigned int sum;
362 vector
signed int sumsqr;
364 sum = (vector
unsigned int)vec_splat_u32(0);
366 for (i = 0; i < h; i++) {
370 vector
unsigned char pix1l = vec_ld( 0, pix1);
371 vector
unsigned char pix1r = vec_ld(15, pix1);
372 vector
unsigned char pix2l = vec_ld( 0, pix2);
373 vector
unsigned char pix2r = vec_ld(15, pix2);
374 t1 = vec_and(vec_perm(pix1l, pix1r, perm1), permclear);
375 t2 = vec_and(vec_perm(pix2l, pix2r, perm2), permclear);
381 t3 = vec_max(t1, t2);
382 t4 = vec_min(t1, t2);
383 t5 = vec_sub(t3, t4);
386 sum = vec_msum(t5, t5, sum);
393 sumsqr = vec_sums((vector
signed int) sum, (vector
signed int) zero);
394 sumsqr = vec_splat(sumsqr, 3);
395 vec_ste(sumsqr, 0, &s);
409 const vector
unsigned int zero = (
const vector
unsigned int)vec_splat_u32(0);
410 vector
unsigned char perm = vec_lvsl(0, pix2);
412 vector
unsigned int sum;
413 vector
signed int sumsqr;
415 sum = (vector
unsigned int)vec_splat_u32(0);
417 for (i = 0; i < h; i++) {
419 vector
unsigned char pix2l = vec_ld( 0, pix2);
420 vector
unsigned char pix2r = vec_ld(15, pix2);
421 t1 = vec_ld(0, pix1);
422 t2 = vec_perm(pix2l, pix2r, perm);
428 t3 = vec_max(t1, t2);
429 t4 = vec_min(t1, t2);
430 t5 = vec_sub(t3, t4);
433 sum = vec_msum(t5, t5, sum);
440 sumsqr = vec_sums((vector
signed int) sum, (vector
signed int) zero);
441 sumsqr = vec_splat(sumsqr, 3);
442 vec_ste(sumsqr, 0, &s);
449 const vector
unsigned int zero = (
const vector
unsigned int)vec_splat_u32(0);
450 vector
unsigned char perm = vec_lvsl(0, pix);
451 vector
unsigned char t1;
452 vector
unsigned int sad;
453 vector
signed int sumdiffs;
458 sad = (vector
unsigned int)vec_splat_u32(0);
460 for (i = 0; i < 16; i++) {
462 vector
unsigned char pixl = vec_ld( 0, pix);
463 vector
unsigned char pixr = vec_ld(15, pix);
464 t1 = vec_perm(pixl, pixr, perm);
467 sad = vec_sum4s(t1, sad);
473 sumdiffs = vec_sums((vector
signed int) sad, (vector
signed int) zero);
474 sumdiffs = vec_splat(sumdiffs, 3);
475 vec_ste(sumdiffs, 0, &s);
483 vector
unsigned char perm = vec_lvsl(0, pixels);
484 vector
unsigned char bytes;
485 const vector
unsigned char zero = (
const vector
unsigned char)vec_splat_u8(0);
486 vector
signed short shorts;
488 for (i = 0; i < 8; i++) {
492 vector
unsigned char pixl = vec_ld( 0, pixels);
493 vector
unsigned char pixr = vec_ld(15, pixels);
494 bytes = vec_perm(pixl, pixr, perm);
497 shorts = (vector
signed short)vec_mergeh(zero, bytes);
500 vec_st(shorts, i*16, (vector
signed short*)block);
510 vector
unsigned char perm1 = vec_lvsl(0, s1);
511 vector
unsigned char perm2 = vec_lvsl(0, s2);
512 vector
unsigned char bytes, pixl, pixr;
513 const vector
unsigned char zero = (
const vector
unsigned char)vec_splat_u8(0);
514 vector
signed short shorts1, shorts2;
516 for (i = 0; i < 4; i++) {
520 pixl = vec_ld( 0, s1);
521 pixr = vec_ld(15, s1);
522 bytes = vec_perm(pixl, pixr, perm1);
525 shorts1 = (vector
signed short)vec_mergeh(zero, bytes);
528 pixl = vec_ld( 0, s2);
529 pixr = vec_ld(15, s2);
530 bytes = vec_perm(pixl, pixr, perm2);
533 shorts2 = (vector
signed short)vec_mergeh(zero, bytes);
536 shorts1 = vec_sub(shorts1, shorts2);
539 vec_st(shorts1, 0, (vector
signed short*)block);
552 pixl = vec_ld( 0, s1);
553 pixr = vec_ld(15, s1);
554 bytes = vec_perm(pixl, pixr, perm1);
557 shorts1 = (vector
signed short)vec_mergeh(zero, bytes);
560 pixl = vec_ld( 0, s2);
561 pixr = vec_ld(15, s2);
562 bytes = vec_perm(pixl, pixr, perm2);
565 shorts2 = (vector
signed short)vec_mergeh(zero, bytes);
568 shorts1 = vec_sub(shorts1, shorts2);
571 vec_st(shorts1, 0, (vector
signed short*)block);
595 register vector
unsigned char vdst, vsrc;
598 for (i = 0 ; (i + 15) < w ; i+=16) {
599 vdst = vec_ld(i, (
unsigned char*)dst);
600 vsrc = vec_ld(i, (
unsigned char*)src);
601 vdst = vec_add(vsrc, vdst);
602 vec_st(vdst, i, (
unsigned char*)dst);
605 for (; (i < w) ; i++) {
613 register vector
unsigned char pixelsv1, pixelsv2;
614 register vector
unsigned char pixelsv1B, pixelsv2B;
615 register vector
unsigned char pixelsv1C, pixelsv2C;
616 register vector
unsigned char pixelsv1D, pixelsv2D;
618 register vector
unsigned char perm = vec_lvsl(0, pixels);
620 register ptrdiff_t line_size_2 = line_size << 1;
621 register ptrdiff_t line_size_3 = line_size + line_size_2;
622 register ptrdiff_t line_size_4 = line_size << 2;
629 for (i = 0; i < h; i += 4) {
630 pixelsv1 = vec_ld( 0, pixels);
631 pixelsv2 = vec_ld(15, pixels);
632 pixelsv1B = vec_ld(line_size, pixels);
633 pixelsv2B = vec_ld(15 + line_size, pixels);
634 pixelsv1C = vec_ld(line_size_2, pixels);
635 pixelsv2C = vec_ld(15 + line_size_2, pixels);
636 pixelsv1D = vec_ld(line_size_3, pixels);
637 pixelsv2D = vec_ld(15 + line_size_3, pixels);
638 vec_st(vec_perm(pixelsv1, pixelsv2, perm),
639 0, (
unsigned char*)block);
640 vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
641 line_size, (
unsigned char*)block);
642 vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
643 line_size_2, (
unsigned char*)block);
644 vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
645 line_size_3, (
unsigned char*)block);
652 #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
655 register vector
unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
656 register vector
unsigned char perm = vec_lvsl(0, pixels);
659 for (i = 0; i < h; i++) {
660 pixelsv1 = vec_ld( 0, pixels);
661 pixelsv2 = vec_ld(16,pixels);
662 blockv = vec_ld(0, block);
663 pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
664 blockv = vec_avg(blockv,pixelsv);
665 vec_st(blockv, 0, (
unsigned char*)block);
674 register vector
unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
677 for (i = 0; i < h; i++) {
680 int rightside = ((
unsigned long)block & 0x0000000F);
682 blockv = vec_ld(0, block);
683 pixelsv1 = vec_ld( 0, pixels);
684 pixelsv2 = vec_ld(16, pixels);
685 pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
688 pixelsv = vec_perm(blockv, pixelsv,
vcprm(0,1,
s0,
s1));
690 pixelsv = vec_perm(blockv, pixelsv,
vcprm(
s0,
s1,2,3));
693 blockv = vec_avg(blockv, pixelsv);
695 vec_st(blockv, 0, block);
706 register vector
unsigned char pixelsv1, pixelsv2, pixelsavg;
707 register vector
unsigned char blockv, temp1, temp2;
708 register vector
unsigned short pixelssum1, pixelssum2, temp3;
709 register const vector
unsigned char vczero = (
const vector
unsigned char)vec_splat_u8(0);
710 register const vector
unsigned short vctwo = (
const vector
unsigned short)vec_splat_u16(2);
712 temp1 = vec_ld(0, pixels);
713 temp2 = vec_ld(16, pixels);
714 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
715 if ((((
unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
718 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
720 pixelsv1 = vec_mergeh(vczero, pixelsv1);
721 pixelsv2 = vec_mergeh(vczero, pixelsv2);
722 pixelssum1 = vec_add((vector
unsigned short)pixelsv1,
723 (vector
unsigned short)pixelsv2);
724 pixelssum1 = vec_add(pixelssum1, vctwo);
726 for (i = 0; i < h ; i++) {
727 int rightside = ((
unsigned long)block & 0x0000000F);
728 blockv = vec_ld(0, block);
730 temp1 = vec_ld(line_size, pixels);
731 temp2 = vec_ld(line_size + 16, pixels);
732 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
733 if (((((
unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
736 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
739 pixelsv1 = vec_mergeh(vczero, pixelsv1);
740 pixelsv2 = vec_mergeh(vczero, pixelsv2);
741 pixelssum2 = vec_add((vector
unsigned short)pixelsv1,
742 (vector
unsigned short)pixelsv2);
743 temp3 = vec_add(pixelssum1, pixelssum2);
744 temp3 = vec_sra(temp3, vctwo);
745 pixelssum1 = vec_add(pixelssum2, vctwo);
746 pixelsavg = vec_packsu(temp3, (vector
unsigned short) vczero);
749 blockv = vec_perm(blockv, pixelsavg,
vcprm(0, 1,
s0,
s1));
751 blockv = vec_perm(blockv, pixelsavg,
vcprm(
s0,
s1, 2, 3));
754 vec_st(blockv, 0, block);
765 register vector
unsigned char pixelsv1, pixelsv2, pixelsavg;
766 register vector
unsigned char blockv, temp1, temp2;
767 register vector
unsigned short pixelssum1, pixelssum2, temp3;
768 register const vector
unsigned char vczero = (
const vector
unsigned char)vec_splat_u8(0);
769 register const vector
unsigned short vcone = (
const vector
unsigned short)vec_splat_u16(1);
770 register const vector
unsigned short vctwo = (
const vector
unsigned short)vec_splat_u16(2);
772 temp1 = vec_ld(0, pixels);
773 temp2 = vec_ld(16, pixels);
774 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
775 if ((((
unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
778 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
780 pixelsv1 = vec_mergeh(vczero, pixelsv1);
781 pixelsv2 = vec_mergeh(vczero, pixelsv2);
782 pixelssum1 = vec_add((vector
unsigned short)pixelsv1,
783 (vector
unsigned short)pixelsv2);
784 pixelssum1 = vec_add(pixelssum1, vcone);
786 for (i = 0; i < h ; i++) {
787 int rightside = ((
unsigned long)block & 0x0000000F);
788 blockv = vec_ld(0, block);
790 temp1 = vec_ld(line_size, pixels);
791 temp2 = vec_ld(line_size + 16, pixels);
792 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
793 if (((((
unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
796 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
799 pixelsv1 = vec_mergeh(vczero, pixelsv1);
800 pixelsv2 = vec_mergeh(vczero, pixelsv2);
801 pixelssum2 = vec_add((vector
unsigned short)pixelsv1,
802 (vector
unsigned short)pixelsv2);
803 temp3 = vec_add(pixelssum1, pixelssum2);
804 temp3 = vec_sra(temp3, vctwo);
805 pixelssum1 = vec_add(pixelssum2, vcone);
806 pixelsavg = vec_packsu(temp3, (vector
unsigned short) vczero);
809 blockv = vec_perm(blockv, pixelsavg,
vcprm(0, 1,
s0,
s1));
811 blockv = vec_perm(blockv, pixelsavg,
vcprm(
s0,
s1, 2, 3));
814 vec_st(blockv, 0, block);
825 register vector
unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
826 register vector
unsigned char blockv, temp1, temp2;
827 register vector
unsigned short temp3, temp4,
828 pixelssum1, pixelssum2, pixelssum3, pixelssum4;
829 register const vector
unsigned char vczero = (
const vector
unsigned char)vec_splat_u8(0);
830 register const vector
unsigned short vctwo = (
const vector
unsigned short)vec_splat_u16(2);
832 temp1 = vec_ld(0, pixels);
833 temp2 = vec_ld(16, pixels);
834 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
835 if ((((
unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
838 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
840 pixelsv3 = vec_mergel(vczero, pixelsv1);
841 pixelsv4 = vec_mergel(vczero, pixelsv2);
842 pixelsv1 = vec_mergeh(vczero, pixelsv1);
843 pixelsv2 = vec_mergeh(vczero, pixelsv2);
844 pixelssum3 = vec_add((vector
unsigned short)pixelsv3,
845 (vector
unsigned short)pixelsv4);
846 pixelssum3 = vec_add(pixelssum3, vctwo);
847 pixelssum1 = vec_add((vector
unsigned short)pixelsv1,
848 (vector
unsigned short)pixelsv2);
849 pixelssum1 = vec_add(pixelssum1, vctwo);
851 for (i = 0; i < h ; i++) {
852 blockv = vec_ld(0, block);
854 temp1 = vec_ld(line_size, pixels);
855 temp2 = vec_ld(line_size + 16, pixels);
856 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
857 if (((((
unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
860 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
863 pixelsv3 = vec_mergel(vczero, pixelsv1);
864 pixelsv4 = vec_mergel(vczero, pixelsv2);
865 pixelsv1 = vec_mergeh(vczero, pixelsv1);
866 pixelsv2 = vec_mergeh(vczero, pixelsv2);
868 pixelssum4 = vec_add((vector
unsigned short)pixelsv3,
869 (vector
unsigned short)pixelsv4);
870 pixelssum2 = vec_add((vector
unsigned short)pixelsv1,
871 (vector
unsigned short)pixelsv2);
872 temp4 = vec_add(pixelssum3, pixelssum4);
873 temp4 = vec_sra(temp4, vctwo);
874 temp3 = vec_add(pixelssum1, pixelssum2);
875 temp3 = vec_sra(temp3, vctwo);
877 pixelssum3 = vec_add(pixelssum4, vctwo);
878 pixelssum1 = vec_add(pixelssum2, vctwo);
880 blockv = vec_packsu(temp3, temp4);
882 vec_st(blockv, 0, block);
893 register vector
unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
894 register vector
unsigned char blockv, temp1, temp2;
895 register vector
unsigned short temp3, temp4,
896 pixelssum1, pixelssum2, pixelssum3, pixelssum4;
897 register const vector
unsigned char vczero = (
const vector
unsigned char)vec_splat_u8(0);
898 register const vector
unsigned short vcone = (
const vector
unsigned short)vec_splat_u16(1);
899 register const vector
unsigned short vctwo = (
const vector
unsigned short)vec_splat_u16(2);
901 temp1 = vec_ld(0, pixels);
902 temp2 = vec_ld(16, pixels);
903 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
904 if ((((
unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
907 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
909 pixelsv3 = vec_mergel(vczero, pixelsv1);
910 pixelsv4 = vec_mergel(vczero, pixelsv2);
911 pixelsv1 = vec_mergeh(vczero, pixelsv1);
912 pixelsv2 = vec_mergeh(vczero, pixelsv2);
913 pixelssum3 = vec_add((vector
unsigned short)pixelsv3,
914 (vector
unsigned short)pixelsv4);
915 pixelssum3 = vec_add(pixelssum3, vcone);
916 pixelssum1 = vec_add((vector
unsigned short)pixelsv1,
917 (vector
unsigned short)pixelsv2);
918 pixelssum1 = vec_add(pixelssum1, vcone);
920 for (i = 0; i < h ; i++) {
921 blockv = vec_ld(0, block);
923 temp1 = vec_ld(line_size, pixels);
924 temp2 = vec_ld(line_size + 16, pixels);
925 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
926 if (((((
unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
929 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
932 pixelsv3 = vec_mergel(vczero, pixelsv1);
933 pixelsv4 = vec_mergel(vczero, pixelsv2);
934 pixelsv1 = vec_mergeh(vczero, pixelsv1);
935 pixelsv2 = vec_mergeh(vczero, pixelsv2);
937 pixelssum4 = vec_add((vector
unsigned short)pixelsv3,
938 (vector
unsigned short)pixelsv4);
939 pixelssum2 = vec_add((vector
unsigned short)pixelsv1,
940 (vector
unsigned short)pixelsv2);
941 temp4 = vec_add(pixelssum3, pixelssum4);
942 temp4 = vec_sra(temp4, vctwo);
943 temp3 = vec_add(pixelssum1, pixelssum2);
944 temp3 = vec_sra(temp3, vctwo);
946 pixelssum3 = vec_add(pixelssum4, vcone);
947 pixelssum1 = vec_add(pixelssum2, vcone);
949 blockv = vec_packsu(temp3, temp4);
951 vec_st(blockv, 0, block);
960 register const vector
unsigned char vzero =
961 (
const vector
unsigned char)vec_splat_u8(0);
962 register vector
signed short temp0, temp1, temp2, temp3, temp4,
965 register const vector
signed short vprod1 =(
const vector
signed short)
966 { 1,-1, 1,-1, 1,-1, 1,-1 };
967 register const vector
signed short vprod2 =(
const vector
signed short)
968 { 1, 1,-1,-1, 1, 1,-1,-1 };
969 register const vector
signed short vprod3 =(
const vector
signed short)
970 { 1, 1, 1, 1,-1,-1,-1,-1 };
971 register const vector
unsigned char perm1 = (
const vector
unsigned char)
972 {0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
973 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D};
974 register const vector
unsigned char perm2 = (
const vector
unsigned char)
975 {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
976 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B};
977 register const vector
unsigned char perm3 = (
const vector
unsigned char)
978 {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
979 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};
981 #define ONEITERBUTTERFLY(i, res) \
983 register vector unsigned char src1, src2, srcO; \
984 register vector unsigned char dst1, dst2, dstO; \
985 register vector signed short srcV, dstV; \
986 register vector signed short but0, but1, but2, op1, op2, op3; \
987 src1 = vec_ld(stride * i, src); \
988 src2 = vec_ld((stride * i) + 15, src); \
989 srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
990 dst1 = vec_ld(stride * i, dst); \
991 dst2 = vec_ld((stride * i) + 15, dst); \
992 dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
995 srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \
996 (vector signed char)srcO); \
997 dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \
998 (vector signed char)dstO); \
1000 but0 = vec_sub(srcV, dstV); \
1001 op1 = vec_perm(but0, but0, perm1); \
1002 but1 = vec_mladd(but0, vprod1, op1); \
1003 op2 = vec_perm(but1, but1, perm2); \
1004 but2 = vec_mladd(but1, vprod2, op2); \
1005 op3 = vec_perm(but2, but2, perm3); \
1006 res = vec_mladd(but2, vprod3, op3); \
1017 #undef ONEITERBUTTERFLY
1019 register vector
signed int vsum;
1020 register vector
signed short line0 = vec_add(temp0, temp1);
1021 register vector
signed short line1 = vec_sub(temp0, temp1);
1022 register vector
signed short line2 = vec_add(temp2, temp3);
1023 register vector
signed short line3 = vec_sub(temp2, temp3);
1024 register vector
signed short line4 = vec_add(temp4, temp5);
1025 register vector
signed short line5 = vec_sub(temp4, temp5);
1026 register vector
signed short line6 = vec_add(temp6, temp7);
1027 register vector
signed short line7 = vec_sub(temp6, temp7);
1029 register vector
signed short line0B = vec_add(line0, line2);
1030 register vector
signed short line2B = vec_sub(line0, line2);
1031 register vector
signed short line1B = vec_add(line1, line3);
1032 register vector
signed short line3B = vec_sub(line1, line3);
1033 register vector
signed short line4B = vec_add(line4, line6);
1034 register vector
signed short line6B = vec_sub(line4, line6);
1035 register vector
signed short line5B = vec_add(line5, line7);
1036 register vector
signed short line7B = vec_sub(line5, line7);
1038 register vector
signed short line0C = vec_add(line0B, line4B);
1039 register vector
signed short line4C = vec_sub(line0B, line4B);
1040 register vector
signed short line1C = vec_add(line1B, line5B);
1041 register vector
signed short line5C = vec_sub(line1B, line5B);
1042 register vector
signed short line2C = vec_add(line2B, line6B);
1043 register vector
signed short line6C = vec_sub(line2B, line6B);
1044 register vector
signed short line3C = vec_add(line3B, line7B);
1045 register vector
signed short line7C = vec_sub(line3B, line7B);
1047 vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
1048 vsum = vec_sum4s(vec_abs(line1C), vsum);
1049 vsum = vec_sum4s(vec_abs(line2C), vsum);
1050 vsum = vec_sum4s(vec_abs(line3C), vsum);
1051 vsum = vec_sum4s(vec_abs(line4C), vsum);
1052 vsum = vec_sum4s(vec_abs(line5C), vsum);
1053 vsum = vec_sum4s(vec_abs(line6C), vsum);
1054 vsum = vec_sum4s(vec_abs(line7C), vsum);
1055 vsum = vec_sums(vsum, (vector
signed int)vzero);
1056 vsum = vec_splat(vsum, 3);
1057 vec_ste(vsum, 0, &sum);
1083 register vector
signed short
1084 temp0 __asm__ (
"v0"),
1085 temp1 __asm__ (
"v1"),
1086 temp2 __asm__ (
"v2"),
1087 temp3 __asm__ (
"v3"),
1088 temp4 __asm__ (
"v4"),
1089 temp5 __asm__ (
"v5"),
1090 temp6 __asm__ (
"v6"),
1091 temp7 __asm__ (
"v7");
1092 register vector
signed short
1093 temp0S __asm__ (
"v8"),
1094 temp1S __asm__ (
"v9"),
1095 temp2S __asm__ (
"v10"),
1096 temp3S __asm__ (
"v11"),
1097 temp4S __asm__ (
"v12"),
1098 temp5S __asm__ (
"v13"),
1099 temp6S __asm__ (
"v14"),
1100 temp7S __asm__ (
"v15");
1101 register const vector
unsigned char vzero __asm__ (
"v31") =
1102 (
const vector
unsigned char)vec_splat_u8(0);
1104 register const vector
signed short vprod1 __asm__ (
"v16") =
1105 (
const vector
signed short){ 1,-1, 1,-1, 1,-1, 1,-1 };
1106 register const vector
signed short vprod2 __asm__ (
"v17") =
1107 (
const vector
signed short){ 1, 1,-1,-1, 1, 1,-1,-1 };
1108 register const vector
signed short vprod3 __asm__ (
"v18") =
1109 (
const vector
signed short){ 1, 1, 1, 1,-1,-1,-1,-1 };
1110 register const vector
unsigned char perm1 __asm__ (
"v19") =
1111 (
const vector
unsigned char)
1112 {0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
1113 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D};
1114 register const vector
unsigned char perm2 __asm__ (
"v20") =
1115 (
const vector
unsigned char)
1116 {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
1117 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B};
1118 register const vector
unsigned char perm3 __asm__ (
"v21") =
1119 (
const vector
unsigned char)
1120 {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
1121 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};
1123 #define ONEITERBUTTERFLY(i, res1, res2) \
1125 register vector unsigned char src1 __asm__ ("v22"), \
1126 src2 __asm__ ("v23"), \
1127 dst1 __asm__ ("v24"), \
1128 dst2 __asm__ ("v25"), \
1129 srcO __asm__ ("v22"), \
1130 dstO __asm__ ("v23"); \
1132 register vector signed short srcV __asm__ ("v24"), \
1133 dstV __asm__ ("v25"), \
1134 srcW __asm__ ("v26"), \
1135 dstW __asm__ ("v27"), \
1136 but0 __asm__ ("v28"), \
1137 but0S __asm__ ("v29"), \
1138 op1 __asm__ ("v30"), \
1139 but1 __asm__ ("v22"), \
1140 op1S __asm__ ("v23"), \
1141 but1S __asm__ ("v24"), \
1142 op2 __asm__ ("v25"), \
1143 but2 __asm__ ("v26"), \
1144 op2S __asm__ ("v27"), \
1145 but2S __asm__ ("v28"), \
1146 op3 __asm__ ("v29"), \
1147 op3S __asm__ ("v30"); \
1149 src1 = vec_ld(stride * i, src); \
1150 src2 = vec_ld((stride * i) + 16, src); \
1151 srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
1152 dst1 = vec_ld(stride * i, dst); \
1153 dst2 = vec_ld((stride * i) + 16, dst); \
1154 dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
1156 srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \
1157 (vector signed char)srcO); \
1158 dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \
1159 (vector signed char)dstO); \
1160 srcW = (vector signed short)vec_mergel((vector signed char)vzero, \
1161 (vector signed char)srcO); \
1162 dstW = (vector signed short)vec_mergel((vector signed char)vzero, \
1163 (vector signed char)dstO); \
1165 but0 = vec_sub(srcV, dstV); \
1166 but0S = vec_sub(srcW, dstW); \
1167 op1 = vec_perm(but0, but0, perm1); \
1168 but1 = vec_mladd(but0, vprod1, op1); \
1169 op1S = vec_perm(but0S, but0S, perm1); \
1170 but1S = vec_mladd(but0S, vprod1, op1S); \
1171 op2 = vec_perm(but1, but1, perm2); \
1172 but2 = vec_mladd(but1, vprod2, op2); \
1173 op2S = vec_perm(but1S, but1S, perm2); \
1174 but2S = vec_mladd(but1S, vprod2, op2S); \
1175 op3 = vec_perm(but2, but2, perm3); \
1176 res1 = vec_mladd(but2, vprod3, op3); \
1177 op3S = vec_perm(but2S, but2S, perm3); \
1178 res2 = vec_mladd(but2S, vprod3, op3S); \
1189 #undef ONEITERBUTTERFLY
1191 register vector
signed int vsum;
1192 register vector
signed short line0S, line1S, line2S, line3S, line4S,
1193 line5S, line6S, line7S, line0BS,line2BS,
1194 line1BS,line3BS,line4BS,line6BS,line5BS,
1195 line7BS,line0CS,line4CS,line1CS,line5CS,
1196 line2CS,line6CS,line3CS,line7CS;
1198 register vector
signed short line0 = vec_add(temp0, temp1);
1199 register vector
signed short line1 = vec_sub(temp0, temp1);
1200 register vector
signed short line2 = vec_add(temp2, temp3);
1201 register vector
signed short line3 = vec_sub(temp2, temp3);
1202 register vector
signed short line4 = vec_add(temp4, temp5);
1203 register vector
signed short line5 = vec_sub(temp4, temp5);
1204 register vector
signed short line6 = vec_add(temp6, temp7);
1205 register vector
signed short line7 = vec_sub(temp6, temp7);
1207 register vector
signed short line0B = vec_add(line0, line2);
1208 register vector
signed short line2B = vec_sub(line0, line2);
1209 register vector
signed short line1B = vec_add(line1, line3);
1210 register vector
signed short line3B = vec_sub(line1, line3);
1211 register vector
signed short line4B = vec_add(line4, line6);
1212 register vector
signed short line6B = vec_sub(line4, line6);
1213 register vector
signed short line5B = vec_add(line5, line7);
1214 register vector
signed short line7B = vec_sub(line5, line7);
1216 register vector
signed short line0C = vec_add(line0B, line4B);
1217 register vector
signed short line4C = vec_sub(line0B, line4B);
1218 register vector
signed short line1C = vec_add(line1B, line5B);
1219 register vector
signed short line5C = vec_sub(line1B, line5B);
1220 register vector
signed short line2C = vec_add(line2B, line6B);
1221 register vector
signed short line6C = vec_sub(line2B, line6B);
1222 register vector
signed short line3C = vec_add(line3B, line7B);
1223 register vector
signed short line7C = vec_sub(line3B, line7B);
1225 vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
1226 vsum = vec_sum4s(vec_abs(line1C), vsum);
1227 vsum = vec_sum4s(vec_abs(line2C), vsum);
1228 vsum = vec_sum4s(vec_abs(line3C), vsum);
1229 vsum = vec_sum4s(vec_abs(line4C), vsum);
1230 vsum = vec_sum4s(vec_abs(line5C), vsum);
1231 vsum = vec_sum4s(vec_abs(line6C), vsum);
1232 vsum = vec_sum4s(vec_abs(line7C), vsum);
1234 line0S = vec_add(temp0S, temp1S);
1235 line1S = vec_sub(temp0S, temp1S);
1236 line2S = vec_add(temp2S, temp3S);
1237 line3S = vec_sub(temp2S, temp3S);
1238 line4S = vec_add(temp4S, temp5S);
1239 line5S = vec_sub(temp4S, temp5S);
1240 line6S = vec_add(temp6S, temp7S);
1241 line7S = vec_sub(temp6S, temp7S);
1243 line0BS = vec_add(line0S, line2S);
1244 line2BS = vec_sub(line0S, line2S);
1245 line1BS = vec_add(line1S, line3S);
1246 line3BS = vec_sub(line1S, line3S);
1247 line4BS = vec_add(line4S, line6S);
1248 line6BS = vec_sub(line4S, line6S);
1249 line5BS = vec_add(line5S, line7S);
1250 line7BS = vec_sub(line5S, line7S);
1252 line0CS = vec_add(line0BS, line4BS);
1253 line4CS = vec_sub(line0BS, line4BS);
1254 line1CS = vec_add(line1BS, line5BS);
1255 line5CS = vec_sub(line1BS, line5BS);
1256 line2CS = vec_add(line2BS, line6BS);
1257 line6CS = vec_sub(line2BS, line6BS);
1258 line3CS = vec_add(line3BS, line7BS);
1259 line7CS = vec_sub(line3BS, line7BS);
1261 vsum = vec_sum4s(vec_abs(line0CS), vsum);
1262 vsum = vec_sum4s(vec_abs(line1CS), vsum);
1263 vsum = vec_sum4s(vec_abs(line2CS), vsum);
1264 vsum = vec_sum4s(vec_abs(line3CS), vsum);
1265 vsum = vec_sum4s(vec_abs(line4CS), vsum);
1266 vsum = vec_sum4s(vec_abs(line5CS), vsum);
1267 vsum = vec_sum4s(vec_abs(line6CS), vsum);
1268 vsum = vec_sum4s(vec_abs(line7CS), vsum);
1269 vsum = vec_sums(vsum, (vector
signed int)
vzero);
1270 vsum = vec_splat(vsum, 3);
1271 vec_ste(vsum, 0, &sum);
1291 register vector
unsigned char pixelsv1, pixelsv2, pixelsavg;
1292 register vector
unsigned char blockv, temp1, temp2, blocktemp;
1293 register vector
unsigned short pixelssum1, pixelssum2, temp3;
1295 register const vector
unsigned char vczero = (
const vector
unsigned char)
1297 register const vector
unsigned short vctwo = (
const vector
unsigned short)
1300 temp1 = vec_ld(0, pixels);
1301 temp2 = vec_ld(16, pixels);
1302 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
1303 if ((((
unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
1306 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
1308 pixelsv1 = vec_mergeh(vczero, pixelsv1);
1309 pixelsv2 = vec_mergeh(vczero, pixelsv2);
1310 pixelssum1 = vec_add((vector
unsigned short)pixelsv1,
1311 (vector
unsigned short)pixelsv2);
1312 pixelssum1 = vec_add(pixelssum1, vctwo);
1314 for (i = 0; i < h ; i++) {
1315 int rightside = ((
unsigned long)block & 0x0000000F);
1316 blockv = vec_ld(0, block);
1318 temp1 = vec_ld(line_size, pixels);
1319 temp2 = vec_ld(line_size + 16, pixels);
1320 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
1321 if (((((
unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
1324 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
1327 pixelsv1 = vec_mergeh(vczero, pixelsv1);
1328 pixelsv2 = vec_mergeh(vczero, pixelsv2);
1329 pixelssum2 = vec_add((vector
unsigned short)pixelsv1,
1330 (vector
unsigned short)pixelsv2);
1331 temp3 = vec_add(pixelssum1, pixelssum2);
1332 temp3 = vec_sra(temp3, vctwo);
1333 pixelssum1 = vec_add(pixelssum2, vctwo);
1334 pixelsavg = vec_packsu(temp3, (vector
unsigned short) vczero);
1337 blocktemp = vec_perm(blockv, pixelsavg,
vcprm(0, 1,
s0,
s1));
1339 blocktemp = vec_perm(blockv, pixelsavg,
vcprm(
s0,
s1, 2, 3));
1342 blockv = vec_avg(blocktemp, blockv);
1343 vec_st(blockv, 0, block);
1346 pixels += line_size;
1367 if (!high_bit_depth) {