00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 
00020 
00021 
00022 
00023 
00024 
00025 
00026 
00027 
00028 
00029 
00030 
00031 
00032 
00033 
00034 
00035 
00036 
00037 
00038 #include <stdlib.h>                                      
00039 #include <string.h>
00040 #include "config.h"
00041 #if HAVE_ALTIVEC_H
00042 #include <altivec.h>
00043 #endif
00044 #include "libavutil/ppc/types_altivec.h"
00045 #include "libavcodec/dsputil.h"
00046 #include "dsputil_altivec.h"
00047 
/* One 1-D 8-point IDCT pass over eight vec_s16 vectors: consumes
 * vx0..vx7, produces vy0..vy7.  Relies on the coefficient and
 * temporary variables declared by the IDCT macro below (a0, a1, a2,
 * ma2, c4, mc4, zero, t0..t8).  All arithmetic is saturating fixed
 * point; vec_mradds multiplies, rounds, and shifts right by 15. */
#define IDCT_HALF                                       \
    /* 1st stage */                                     \
    t1 = vec_mradds (a1, vx7, vx1 );                    \
    t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7));    \
    t7 = vec_mradds (a2, vx5, vx3);                     \
    t3 = vec_mradds (ma2, vx3, vx5);                    \
                                                        \
    /* 2nd stage */                                     \
    t5 = vec_adds (vx0, vx4);                           \
    t0 = vec_subs (vx0, vx4);                           \
    t2 = vec_mradds (a0, vx6, vx2);                     \
    t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6));    \
    t6 = vec_adds (t8, t3);                             \
    t3 = vec_subs (t8, t3);                             \
    t8 = vec_subs (t1, t7);                             \
    t1 = vec_adds (t1, t7);                             \
                                                        \
    /* 3rd stage */                                     \
    t7 = vec_adds (t5, t2);                             \
    t2 = vec_subs (t5, t2);                             \
    t5 = vec_adds (t0, t4);                             \
    t0 = vec_subs (t0, t4);                             \
    t4 = vec_subs (t8, t3);                             \
    t3 = vec_adds (t8, t3);                             \
                                                        \
    /* 4th stage */                                     \
    vy0 = vec_adds (t7, t1);                            \
    vy7 = vec_subs (t7, t1);                            \
    vy1 = vec_mradds (c4, t3, t5);                      \
    vy6 = vec_mradds (mc4, t3, t5);                     \
    vy2 = vec_mradds (c4, t4, t0);                      \
    vy5 = vec_mradds (mc4, t4, t0);                     \
    vy3 = vec_adds (t2, t6);                            \
    vy4 = vec_subs (t2, t6);
00082 
00083 
/* Full 2-D 8x8 IDCT on block[0..7] (one vec_s16 per row): load and
 * prescale the rows, run IDCT_HALF on them, transpose via three
 * rounds of 16-bit merges (folding in the rounding bias), run
 * IDCT_HALF again on the columns, then shift down into vx0..vx7.
 * Declares every local the passes use, so it must sit at the top of
 * the enclosing function's declarations. */
#define IDCT                                                            \
    vec_s16 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;                \
    vec_s16 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;                \
    vec_s16 a0, a1, a2, ma2, c4, mc4, zero, bias;                  \
    vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, t8;                    \
    vec_u16 shift;                                                 \
                                                                        \
    /* broadcast the IDCT coefficients from constants[0] lanes 0-5 */   \
    c4 = vec_splat (constants[0], 0);                                   \
    a0 = vec_splat (constants[0], 1);                                   \
    a1 = vec_splat (constants[0], 2);                                   \
    a2 = vec_splat (constants[0], 3);                                   \
    mc4 = vec_splat (constants[0], 4);                                  \
    ma2 = vec_splat (constants[0], 5);                                  \
    /* bias: splat 32-bit element 3 (halfword pair {32, 31}) */         \
    bias = (vec_s16)vec_splat ((vec_s32)constants[0], 3);     \
                                                                        \
    zero = vec_splat_s16 (0);                                           \
    shift = vec_splat_u16 (4);                                          \
                                                                        \
    /* prescale each input row (<<4, then *constants[n] via mradds);    \
       rows 4..7 reuse tables 1,4,3,2 symmetrically */                  \
    vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero);    \
    vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero);    \
    vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero);    \
    vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero);    \
    vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero);    \
    vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero);    \
    vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero);    \
    vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero);    \
                                                                        \
    /* first (row) pass */                                              \
    IDCT_HALF                                                           \
                                                                        \
    /* 8x8 transpose, round 1 of 3 (16-bit merges) */                   \
    vx0 = vec_mergeh (vy0, vy4);                                        \
    vx1 = vec_mergel (vy0, vy4);                                        \
    vx2 = vec_mergeh (vy1, vy5);                                        \
    vx3 = vec_mergel (vy1, vy5);                                        \
    vx4 = vec_mergeh (vy2, vy6);                                        \
    vx5 = vec_mergel (vy2, vy6);                                        \
    vx6 = vec_mergeh (vy3, vy7);                                        \
    vx7 = vec_mergel (vy3, vy7);                                        \
                                                                        \
    /* transpose round 2 */                                             \
    vy0 = vec_mergeh (vx0, vx4);                                        \
    vy1 = vec_mergel (vx0, vx4);                                        \
    vy2 = vec_mergeh (vx1, vx5);                                        \
    vy3 = vec_mergel (vx1, vx5);                                        \
    vy4 = vec_mergeh (vx2, vx6);                                        \
    vy5 = vec_mergel (vx2, vx6);                                        \
    vy6 = vec_mergeh (vx3, vx7);                                        \
    vy7 = vec_mergel (vx3, vx7);                                        \
                                                                        \
    /* transpose round 3; fold the rounding bias into the first row */  \
    vx0 = vec_adds (vec_mergeh (vy0, vy4), bias);                       \
    vx1 = vec_mergel (vy0, vy4);                                        \
    vx2 = vec_mergeh (vy1, vy5);                                        \
    vx3 = vec_mergel (vy1, vy5);                                        \
    vx4 = vec_mergeh (vy2, vy6);                                        \
    vx5 = vec_mergel (vy2, vy6);                                        \
    vx6 = vec_mergeh (vy3, vy7);                                        \
    vx7 = vec_mergel (vy3, vy7);                                        \
                                                                        \
    /* second (column) pass */                                          \
    IDCT_HALF                                                           \
                                                                        \
    /* scale the results down to the output sample range */             \
    shift = vec_splat_u16 (6);                                          \
    vx0 = vec_sra (vy0, shift);                                         \
    vx1 = vec_sra (vy1, shift);                                         \
    vx2 = vec_sra (vy2, shift);                                         \
    vx3 = vec_sra (vy3, shift);                                         \
    vx4 = vec_sra (vy4, shift);                                         \
    vx5 = vec_sra (vy5, shift);                                         \
    vx6 = vec_sra (vy6, shift);                                         \
    vx7 = vec_sra (vy7, shift);
00151 
00152 
/* Fixed-point multiplier tables, consumed via vec_mradds
 * (product rounded and shifted right by 15).
 * constants[0]: IDCT coefficients splatted by the IDCT macro —
 *   lane 0 c4, 1 a0, 2 a1, 3 a2, 4 mc4 (= -c4), 5 ma2 (= -a2);
 *   lanes 6-7 {32, 31} form the 32-bit rounding-bias word that IDCT
 *   splats as a vec_s32 element.
 * constants[1..4]: per-row input prescale factors; the IDCT macro
 *   applies them to rows 0..3 and reuses them for rows 4..7. */
static const vec_s16 constants[5] = {
    {23170, 13573,  6518, 21895, -23170, -21895,    32,    31},
    {16384, 22725, 21407, 19266,  16384,  19266, 21407, 22725},
    {22725, 31521, 29692, 26722,  22725,  26722, 29692, 31521},
    {21407, 29692, 27969, 25172,  21407,  25172, 27969, 29692},
    {19266, 26722, 25172, 22654,  19266,  22654, 25172, 26722}
};
00160 
/* Compute the 2-D IDCT of the 8x8 int16 coefficient block in blk and
 * store the result into dest, clamped to unsigned 8-bit: 8 bytes per
 * row, consecutive rows stride bytes apart. */
void ff_idct_put_altivec(uint8_t* dest, int stride, int16_t *blk)
{
    vec_s16 *block = (vec_s16*)blk;   /* one vector per 8-sample row */
    vec_u8 tmp;

    IDCT   /* transformed rows land in vx0..vx7 */

/* Saturate one result row to unsigned bytes and store its low 8 bytes
 * as two 4-byte element stores.  NOTE(review): vec_ste writes the
 * element at an address aligned to the element size, so dest appears
 * to be assumed at least 4-byte aligned — confirm with callers. */
#define COPY(dest,src)                                          \
    tmp = vec_packsu (src, src);                                \
    vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest);       \
    vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest);

    COPY (dest, vx0)    dest += stride;
    COPY (dest, vx1)    dest += stride;
    COPY (dest, vx2)    dest += stride;
    COPY (dest, vx3)    dest += stride;
    COPY (dest, vx4)    dest += stride;
    COPY (dest, vx5)    dest += stride;
    COPY (dest, vx6)    dest += stride;
    COPY (dest, vx7)
}
00182 
/* Compute the 2-D IDCT of the 8x8 int16 coefficient block in blk and
 * add the result to the existing pixels at dest with unsigned
 * saturation: 8 bytes per row, rows stride bytes apart. */
void ff_idct_add_altivec(uint8_t* dest, int stride, int16_t *blk)
{
    vec_s16 *block = (vec_s16*)blk;   /* one vector per 8-sample row */
    vec_u8 tmp;
    vec_s16 tmp2, tmp3;
    vec_u8 perm0;
    vec_u8 perm1;
    vec_u8 p0, p1, p;

    IDCT   /* transformed rows land in vx0..vx7 */

    /* Build permute masks that pull the 8 dest pixels out of an
     * aligned 16-byte load and interleave them with bytes taken from
     * the zero vector (merge with 0xFF indices, which select from
     * vec_perm's second operand), widening u8 pixels to s16.  Even
     * and odd rows can sit at different offsets within the aligned
     * load, hence two masks. */
    p0 = vec_lvsl (0, dest);
    p1 = vec_lvsl (stride, dest);
    p = vec_splat_u8 (-1);
    perm0 = vec_mergeh (p, p0);
    perm1 = vec_mergeh (p, p1);

/* Load the row's pixels, zero-extend to s16, add the IDCT row with
 * saturation, pack back to u8 and store 8 bytes as two element
 * stores.  NOTE(review): vec_ste stores at an element-aligned
 * address, so dest appears assumed 4-byte aligned — confirm. */
#define ADD(dest,src,perm)                                              \
    /* aligned load covering the row's pixels */                        \
    tmp = vec_ld (0, dest);                                             \
    tmp2 = (vec_s16)vec_perm (tmp, (vec_u8)zero, perm);       \
    tmp3 = vec_adds (tmp2, src);                                        \
    tmp = vec_packsu (tmp3, tmp3);                                      \
    vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest);               \
    vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest);

    ADD (dest, vx0, perm0)      dest += stride;
    ADD (dest, vx1, perm1)      dest += stride;
    ADD (dest, vx2, perm0)      dest += stride;
    ADD (dest, vx3, perm1)      dest += stride;
    ADD (dest, vx4, perm0)      dest += stride;
    ADD (dest, vx5, perm1)      dest += stride;
    ADD (dest, vx6, perm0)      dest += stride;
    ADD (dest, vx7, perm1)
}