00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038 #include <stdlib.h>
00039 #include <string.h>
00040 #include "libavcodec/dsputil.h"
00041
00042 #include "gcc_fixes.h"
00043 #include "types_altivec.h"
00044 #include "dsputil_ppc.h"
00045
/*
 * IDCT_HALF: one pass of the transform used twice by the IDCT macro.
 *
 * Reads the eight vectors vx0..vx7 and writes the eight vectors vy0..vy7,
 * using t0..t8 as scratch.  The expansion expects a0, a1, a2, ma2, c4, mc4
 * and zero to be in scope already (IDCT declares and loads them).
 *
 * vec_mradds(a, b, c) is the AltiVec "multiply round and add saturated"
 * operation: per 16-bit lane, ((a * b + 0x4000) >> 15) + c with signed
 * saturation.  vec_adds / vec_subs are the saturating add / subtract.
 *
 * Several temporaries (t1, t3, t8, ...) are overwritten while their old
 * value is still needed by the neighbouring statement, so the statement
 * order inside each stage must not be changed.
 */
00046 #define IDCT_HALF \
00047 /* stage 1: combine the odd inputs (vx1, vx7) with a1 and (vx3, vx5) with a2 / ma2 */ \
00048 t1 = vec_mradds (a1, vx7, vx1 ); \
00049 t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7)); \
00050 t7 = vec_mradds (a2, vx5, vx3); \
00051 t3 = vec_mradds (ma2, vx3, vx5); \
00052 \
00053 /* stage 2: sum/difference of vx0, vx4; combine (vx2, vx6) with a0; butterflies on the stage-1 results */ \
00054 t5 = vec_adds (vx0, vx4); \
00055 t0 = vec_subs (vx0, vx4); \
00056 t2 = vec_mradds (a0, vx6, vx2); \
00057 t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6)); \
00058 t6 = vec_adds (t8, t3); \
00059 t3 = vec_subs (t8, t3); \
00060 t8 = vec_subs (t1, t7); \
00061 t1 = vec_adds (t1, t7); \
00062 \
00063 /* stage 3: butterflies on the stage-2 results (t5/t2, t0/t4, t8/t3) */ \
00064 t7 = vec_adds (t5, t2); \
00065 t2 = vec_subs (t5, t2); \
00066 t5 = vec_adds (t0, t4); \
00067 t0 = vec_subs (t0, t4); \
00068 t4 = vec_subs (t8, t3); \
00069 t3 = vec_adds (t8, t3); \
00070 \
00071 /* stage 4: form the outputs; vy1/vy6 and vy2/vy5 add +c4 / mc4 times t3 resp. t4 */ \
00072 vy0 = vec_adds (t7, t1); \
00073 vy7 = vec_subs (t7, t1); \
00074 vy1 = vec_mradds (c4, t3, t5); \
00075 vy6 = vec_mradds (mc4, t3, t5); \
00076 vy2 = vec_mradds (c4, t4, t0); \
00077 vy5 = vec_mradds (mc4, t4, t0); \
00078 vy3 = vec_adds (t2, t6); \
00079 vy4 = vec_subs (t2, t6);
00080
00081
/*
 * IDCT: complete transform body, expanded inside idct_put_altivec() and
 * idct_add_altivec().
 *
 * Requirements on the expansion site:
 *   - `block` must be in scope and indexable as block[0]..block[7]
 *     (eight vec_s16 vectors);
 *   - the file-scope table `constants` must be visible;
 *   - the macro declares its own locals (vx*, vy*, t*, coefficients, zero,
 *     bias, shift), so it has to appear where declarations are allowed and
 *     only once per scope.
 *
 * Result: vx0..vx7 hold the eight output vectors, already shifted right
 * by 6.  `zero` also stays in scope and is reused by idct_add_altivec().
 */
00082 #define IDCT \
00083 vec_s16 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; \
00084 vec_s16 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \
00085 vec_s16 a0, a1, a2, ma2, c4, mc4, zero, bias; \
00086 vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, t8; \
00087 vec_u16 shift; \
00088 /* broadcast the scalar coefficients stored in constants[0] to all lanes */ \
00089 c4 = vec_splat (constants[0], 0); \
00090 a0 = vec_splat (constants[0], 1); \
00091 a1 = vec_splat (constants[0], 2); \
00092 a2 = vec_splat (constants[0], 3); \
00093 mc4 = vec_splat (constants[0], 4); \
00094 ma2 = vec_splat (constants[0], 5); \
00095 bias = (vec_s16)vec_splat ((vec_s32)constants[0], 3); /* 32-bit element 3 = s16 elements 6,7 = {32, 31}, repeated across the vector */ \
00096 \
00097 zero = vec_splat_s16 (0); \
00098 shift = vec_splat_u16 (4); \
00099 /* pre-scale: each input row is shifted left by 4, then multiplied (vec_mradds, zero addend) by its row of constants[1..4] */ \
00100 vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero); \
00101 vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero); \
00102 vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero); \
00103 vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero); \
00104 vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero); \
00105 vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero); \
00106 vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero); \
00107 vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero); \
00108 /* first pass: vx0..vx7 -> vy0..vy7 */ \
00109 IDCT_HALF \
00110 /* three rounds of mergeh/mergel interleaving (vy->vx, vx->vy, vy->vx) transpose the 8x8 block for the second pass */ \
00111 vx0 = vec_mergeh (vy0, vy4); \
00112 vx1 = vec_mergel (vy0, vy4); \
00113 vx2 = vec_mergeh (vy1, vy5); \
00114 vx3 = vec_mergel (vy1, vy5); \
00115 vx4 = vec_mergeh (vy2, vy6); \
00116 vx5 = vec_mergel (vy2, vy6); \
00117 vx6 = vec_mergeh (vy3, vy7); \
00118 vx7 = vec_mergel (vy3, vy7); \
00119 \
00120 vy0 = vec_mergeh (vx0, vx4); \
00121 vy1 = vec_mergel (vx0, vx4); \
00122 vy2 = vec_mergeh (vx1, vx5); \
00123 vy3 = vec_mergel (vx1, vx5); \
00124 vy4 = vec_mergeh (vx2, vx6); \
00125 vy5 = vec_mergel (vx2, vx6); \
00126 vy6 = vec_mergeh (vx3, vx7); \
00127 vy7 = vec_mergel (vx3, vx7); \
00128 /* last merge round; bias is added to vx0 only -- presumably the rounding term for the final >> 6 (TODO confirm) */ \
00129 vx0 = vec_adds (vec_mergeh (vy0, vy4), bias); \
00130 vx1 = vec_mergel (vy0, vy4); \
00131 vx2 = vec_mergeh (vy1, vy5); \
00132 vx3 = vec_mergel (vy1, vy5); \
00133 vx4 = vec_mergeh (vy2, vy6); \
00134 vx5 = vec_mergel (vy2, vy6); \
00135 vx6 = vec_mergeh (vy3, vy7); \
00136 vx7 = vec_mergel (vy3, vy7); \
00137 /* second pass: vx0..vx7 -> vy0..vy7 */ \
00138 IDCT_HALF \
00139 /* final scaling: arithmetic shift right by 6, results left in vx0..vx7 for the caller */ \
00140 shift = vec_splat_u16 (6); \
00141 vx0 = vec_sra (vy0, shift); \
00142 vx1 = vec_sra (vy1, shift); \
00143 vx2 = vec_sra (vy2, shift); \
00144 vx3 = vec_sra (vy3, shift); \
00145 vx4 = vec_sra (vy4, shift); \
00146 vx5 = vec_sra (vy5, shift); \
00147 vx6 = vec_sra (vy6, shift); \
00148 vx7 = vec_sra (vy7, shift);
00149
00150
/*
 * Coefficient table read by the IDCT macro.
 *
 * constants[0] holds scalars that IDCT broadcasts with vec_splat:
 *   element 0 -> c4, 1 -> a0, 2 -> a1, 3 -> a2,
 *   element 4 -> mc4 (the negation of element 0),
 *   element 5 -> ma2 (the negation of element 3),
 *   elements 6,7 -> the {32, 31} pair splatted as one 32-bit element into bias.
 *
 * constants[1..4] are per-lane multipliers applied to the input rows before
 * the first pass: [1] for rows 0 and 4, [2] for rows 1 and 7,
 * [3] for rows 2 and 6, [4] for rows 3 and 5.
 *
 * All values are used as the multiplier operand of vec_mradds, which shifts
 * the product right by 15.
 */
00151 static const vec_s16 constants[5] = {
00152 {23170, 13573, 6518, 21895, -23170, -21895, 32, 31}, /* c4, a0, a1, a2, mc4, ma2, bias pair */
00153 {16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725}, /* pre-scale for block[0], block[4] */
00154 {22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521}, /* pre-scale for block[1], block[7] */
00155 {21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692}, /* pre-scale for block[2], block[6] */
00156 {19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722} /* pre-scale for block[3], block[5] */
00157 };
00158
/*
 * Transform `block` with the IDCT macro and store the result as unsigned
 * bytes, overwriting the destination.
 *
 * dest   - first output row; 8 bytes are written per row, for 8 rows.
 * stride - byte distance between consecutive output rows.
 * block  - eight vec_s16 vectors (block[0]..block[7]) read by IDCT.
 *
 * NOTE(review): each row is written with two vec_ste word stores at offsets
 * 0 and 4, so every row address is presumably expected to be at least
 * 4-byte aligned -- confirm against callers.
 */
00159 void idct_put_altivec(uint8_t* dest, int stride, vec_s16* block)
00160 {
00161 POWERPC_PERF_DECLARE(altivec_idct_put_num, 1);
00162 vec_u8 tmp;
00163
/* NOTE(review): only START_COUNT is guarded by CONFIG_POWERPC_PERF; DECLARE and STOP_COUNT are used unconditionally. */
00164 #if CONFIG_POWERPC_PERF
00165 POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1);
00166 #endif
/* Declares the working vectors and leaves the transformed rows in vx0..vx7. */
00167 IDCT
00168
/*
 * COPY: vec_packsu(src, src) saturates the eight s16 values to u8 and places
 * the same 8 bytes in both halves of tmp.  vec_ste stores the 32-bit element
 * selected by the target address, so duplicating the row makes the two word
 * stores correct for either 8-byte half of a 16-byte line.
 * The macro expands to several statements (no do/while wrapper) and is not
 * #undef'd after the function.
 */
00169 #define COPY(dest,src) \
00170 tmp = vec_packsu (src, src); \
00171 vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest); \
00172 vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest);
00173
/* One output row per line; dest advances by stride after each of the first seven rows. */
00174 COPY (dest, vx0) dest += stride;
00175 COPY (dest, vx1) dest += stride;
00176 COPY (dest, vx2) dest += stride;
00177 COPY (dest, vx3) dest += stride;
00178 COPY (dest, vx4) dest += stride;
00179 COPY (dest, vx5) dest += stride;
00180 COPY (dest, vx6) dest += stride;
00181 COPY (dest, vx7)
00182
00183 POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1);
00184 }
00185
/*
 * Transform `block` with the IDCT macro and add the result to the bytes
 * already present at the destination, saturating to the unsigned 8-bit range.
 *
 * dest   - first row of the 8-byte-wide, 8-row area that is read and updated.
 * stride - byte distance between consecutive rows.
 * block  - eight vec_s16 vectors (block[0]..block[7]) read by IDCT.
 *
 * NOTE(review): perm0 is derived from dest and perm1 from dest + stride, and
 * they are then reused for alternating rows.  That is only correct if rows
 * two apart have the same offset within a 16-byte line (e.g. stride a
 * multiple of 8) -- confirm against callers.
 * NOTE(review): vec_ld reads the single aligned 16-byte line containing
 * dest, so the 8 pixels of a row presumably must not straddle a 16-byte
 * boundary -- confirm the alignment guarantee.
 */
00186 void idct_add_altivec(uint8_t* dest, int stride, vec_s16* block)
00187 {
00188 POWERPC_PERF_DECLARE(altivec_idct_add_num, 1);
00189 vec_u8 tmp;
00190 vec_s16 tmp2, tmp3;
00191 vec_u8 perm0;
00192 vec_u8 perm1;
00193 vec_u8 p0, p1, p;
00194
/* NOTE(review): only START_COUNT is guarded by CONFIG_POWERPC_PERF; DECLARE and STOP_COUNT are used unconditionally. */
00195 #if CONFIG_POWERPC_PERF
00196 POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1);
00197 #endif
00198
/* Declares the working vectors (including zero) and leaves the transformed rows in vx0..vx7. */
00199 IDCT
00200
/*
 * Build the permute controls that widen 8 destination bytes to 8 s16 lanes.
 * vec_lvsl yields the byte indices starting at the address's offset within
 * its 16-byte line; interleaving them with 0xFF bytes makes vec_perm pick,
 * for every 16-bit lane, one byte from the second operand (zero) as the high
 * byte and one destination byte as the low byte.
 */
00201 p0 = vec_lvsl (0, dest);
00202 p1 = vec_lvsl (stride, dest);
00203 p = vec_splat_u8 (-1);
00204 perm0 = vec_mergeh (p, p0);
00205 perm1 = vec_mergeh (p, p1);
00206
/*
 * ADD: load the line containing dest, zero-extend the row's 8 bytes to s16
 * via perm, add the transformed row with saturation, pack back to u8 with
 * unsigned saturation (duplicated into both halves of tmp) and store the
 * row with two vec_ste word stores at offsets 0 and 4.
 * The macro expands to several statements (no do/while wrapper) and is not
 * #undef'd after the function.
 */
00207 #define ADD(dest,src,perm) \
00208 \
00209 tmp = vec_ld (0, dest); \
00210 tmp2 = (vec_s16)vec_perm (tmp, (vec_u8)zero, perm); \
00211 tmp3 = vec_adds (tmp2, src); \
00212 tmp = vec_packsu (tmp3, tmp3); \
00213 vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest); \
00214 vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest);
00215
/* Even rows use perm0, odd rows use perm1; dest advances by stride after each of the first seven rows. */
00216 ADD (dest, vx0, perm0) dest += stride;
00217 ADD (dest, vx1, perm1) dest += stride;
00218 ADD (dest, vx2, perm0) dest += stride;
00219 ADD (dest, vx3, perm1) dest += stride;
00220 ADD (dest, vx4, perm0) dest += stride;
00221 ADD (dest, vx5, perm1) dest += stride;
00222 ADD (dest, vx6, perm0) dest += stride;
00223 ADD (dest, vx7, perm1)
00224
00225 POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1);
00226 }
00227