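/*
 * AltiVec-accelerated inverse discrete cosine transform (IDCT).
 * The 2-D IDCT is computed as two 1-D passes (rows, then columns)
 * with a transpose in between; all arithmetic is saturated 1.15
 * fixed point built on the vec_mradds multiply-round-add primitive.
 */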
#include <stdlib.h>
#include <string.h>
#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include "libavcodec/dsputil.h"
#include "types_altivec.h"
#include "dsputil_ppc.h"
#include "dsputil_altivec.h"

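/*
 * IDCT_HALF is one 1-D eight-point IDCT over the input vectors
 * vx0..vx7, producing vy0..vy7.  It is a four-stage butterfly
 * network; vec_mradds(a, b, c) computes the saturated value of
 * ((a * b + 0x4000) >> 15) + c, i.e. a 1.15 fixed-point multiply
 * fused with the butterfly add.
 */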
#define IDCT_HALF                                       \
    /* 1st stage */                                     \
    t1 = vec_mradds (a1, vx7, vx1);                     \
    t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7));    \
    t7 = vec_mradds (a2, vx5, vx3);                     \
    t3 = vec_mradds (ma2, vx3, vx5);                    \
                                                        \
    /* 2nd stage */                                     \
    t5 = vec_adds (vx0, vx4);                           \
    t0 = vec_subs (vx0, vx4);                           \
    t2 = vec_mradds (a0, vx6, vx2);                     \
    t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6));    \
    t6 = vec_adds (t8, t3);                             \
    t3 = vec_subs (t8, t3);                             \
    t8 = vec_subs (t1, t7);                             \
    t1 = vec_adds (t1, t7);                             \
                                                        \
    /* 3rd stage */                                     \
    t7 = vec_adds (t5, t2);                             \
    t2 = vec_subs (t5, t2);                             \
    t5 = vec_adds (t0, t4);                             \
    t0 = vec_subs (t0, t4);                             \
    t4 = vec_subs (t8, t3);                             \
    t3 = vec_adds (t8, t3);                             \
                                                        \
    /* 4th stage */                                     \
    vy0 = vec_adds (t7, t1);                            \
    vy7 = vec_subs (t7, t1);                            \
    vy1 = vec_mradds (c4, t3, t5);                      \
    vy6 = vec_mradds (mc4, t3, t5);                     \
    vy2 = vec_mradds (c4, t4, t0);                      \
    vy5 = vec_mradds (mc4, t4, t0);                     \
    vy3 = vec_adds (t2, t6);                            \
    vy4 = vec_subs (t2, t6);
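/*
 * IDCT computes the full 2-D transform of the eight row vectors in
 * block[]: prescale the rows, run IDCT_HALF on them, transpose the
 * 8x8 matrix with a merge network, add the rounding bias, run
 * IDCT_HALF on the columns, and shift the results down by 6 bits
 * into vx0..vx7.
 */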
#define IDCT                                                            \
    vec_s16 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;                     \
    vec_s16 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;                     \
    vec_s16 a0, a1, a2, ma2, c4, mc4, zero, bias;                       \
    vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, t8;                         \
    vec_u16 shift;                                                      \
                                                                        \
    c4 = vec_splat (constants[0], 0);                                   \
    a0 = vec_splat (constants[0], 1);                                   \
    a1 = vec_splat (constants[0], 2);                                   \
    a2 = vec_splat (constants[0], 3);                                   \
    mc4 = vec_splat (constants[0], 4);                                  \
    ma2 = vec_splat (constants[0], 5);                                  \
    bias = (vec_s16)vec_splat ((vec_s32)constants[0], 3);               \
                                                                        \
    zero = vec_splat_s16 (0);                                           \
    shift = vec_splat_u16 (4);                                          \
                                                                        \
    /* 1st 1-D pass: prescale each row, then transform it */            \
    vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero);    \
    vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero);    \
    vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero);    \
    vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero);    \
    vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero);    \
    vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero);    \
    vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero);    \
    vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero);    \
                                                                        \
    IDCT_HALF                                                           \
                                                                        \
    /* transpose the 8x8 matrix: three rounds of 16-bit merges */       \
    vx0 = vec_mergeh (vy0, vy4);                                        \
    vx1 = vec_mergel (vy0, vy4);                                        \
    vx2 = vec_mergeh (vy1, vy5);                                        \
    vx3 = vec_mergel (vy1, vy5);                                        \
    vx4 = vec_mergeh (vy2, vy6);                                        \
    vx5 = vec_mergel (vy2, vy6);                                        \
    vx6 = vec_mergeh (vy3, vy7);                                        \
    vx7 = vec_mergel (vy3, vy7);                                        \
                                                                        \
    vy0 = vec_mergeh (vx0, vx4);                                        \
    vy1 = vec_mergel (vx0, vx4);                                        \
    vy2 = vec_mergeh (vx1, vx5);                                        \
    vy3 = vec_mergel (vx1, vx5);                                        \
    vy4 = vec_mergeh (vx2, vx6);                                        \
    vy5 = vec_mergel (vx2, vx6);                                        \
    vy6 = vec_mergeh (vx3, vx7);                                        \
    vy7 = vec_mergel (vx3, vx7);                                        \
                                                                        \
    /* last merge round; fold the rounding bias into element 0 */       \
    vx0 = vec_adds (vec_mergeh (vy0, vy4), bias);                       \
    vx1 = vec_mergel (vy0, vy4);                                        \
    vx2 = vec_mergeh (vy1, vy5);                                        \
    vx3 = vec_mergel (vy1, vy5);                                        \
    vx4 = vec_mergeh (vy2, vy6);                                        \
    vx5 = vec_mergel (vy2, vy6);                                        \
    vx6 = vec_mergeh (vy3, vy7);                                        \
    vx7 = vec_mergel (vy3, vy7);                                        \
                                                                        \
    /* 2nd 1-D pass, over the columns */                                \
    IDCT_HALF                                                           \
                                                                        \
    /* descale: drop the 6 extra fixed-point bits */                    \
    shift = vec_splat_u16 (6);                                          \
    vx0 = vec_sra (vy0, shift);                                         \
    vx1 = vec_sra (vy1, shift);                                         \
    vx2 = vec_sra (vy2, shift);                                         \
    vx3 = vec_sra (vy3, shift);                                         \
    vx4 = vec_sra (vy4, shift);                                         \
    vx5 = vec_sra (vy5, shift);                                         \
    vx6 = vec_sra (vy6, shift);                                         \
    vx7 = vec_sra (vy7, shift);
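/*
 * constants[0] holds the 1.15 fixed-point butterfly coefficients
 * (e.g. 23170 ~= cos(pi/4) * 2^15) plus the bias words {32, 31};
 * constants[1..4] are the per-row prescale vectors.
 */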
static const vec_s16 constants[5] = {
    {23170, 13573,  6518, 21895, -23170, -21895,    32,    31},
    {16384, 22725, 21407, 19266,  16384,  19266, 21407, 22725},
    {22725, 31521, 29692, 26722,  22725,  26722, 29692, 31521},
    {21407, 29692, 27969, 25172,  21407,  25172, 27969, 29692},
    {19266, 26722, 25172, 22654,  19266,  22654, 25172, 26722}
};

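/*
 * idct_put: transform the coefficient block and write the result to
 * dest as unsigned pixels, one 8-byte row per stride step.
 */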
void idct_put_altivec(uint8_t* dest, int stride, int16_t *blk)
{
    POWERPC_PERF_DECLARE(altivec_idct_put_num, 1);
    vec_s16 *block = (vec_s16*)blk;
    vec_u8 tmp;

#if CONFIG_POWERPC_PERF
    POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1);
#endif
    IDCT

/* saturate one row to unsigned bytes and store its 8 result bytes
 * as two 32-bit element stores */
#define COPY(dest,src)                                      \
    tmp = vec_packsu (src, src);                            \
    vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest);        \
    vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest);

    COPY (dest, vx0)    dest += stride;
    COPY (dest, vx1)    dest += stride;
    COPY (dest, vx2)    dest += stride;
    COPY (dest, vx3)    dest += stride;
    COPY (dest, vx4)    dest += stride;
    COPY (dest, vx5)    dest += stride;
    COPY (dest, vx6)    dest += stride;
    COPY (dest, vx7)

    POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1);
}

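/*
 * idct_add: transform the coefficient block and add the result to
 * the pixels already in dest, saturating to the 0..255 range.
 */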
void idct_add_altivec(uint8_t* dest, int stride, int16_t *blk)
{
    POWERPC_PERF_DECLARE(altivec_idct_add_num, 1);
    vec_s16 *block = (vec_s16*)blk;
    vec_u8 tmp;
    vec_s16 tmp2, tmp3;
    vec_u8 perm0;
    vec_u8 perm1;
    vec_u8 p0, p1, p;

#if CONFIG_POWERPC_PERF
    POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1);
#endif

    IDCT

    /* build permute vectors that zero-extend the destination bytes
     * of even and odd rows to 16-bit values: the 0xff indices in the
     * merged vector select bytes from the zero operand of vec_perm */
    p0 = vec_lvsl (0, dest);
    p1 = vec_lvsl (stride, dest);
    p = vec_splat_u8 (-1);
    perm0 = vec_mergeh (p, p0);
    perm1 = vec_mergeh (p, p1);

/* load a row of dest pixels, widen them to s16, add the IDCT
 * output, then saturate back to unsigned bytes and store 8 bytes */
#define ADD(dest,src,perm)                                  \
    tmp = vec_ld (0, dest);                                 \
    tmp2 = (vec_s16)vec_perm (tmp, (vec_u8)zero, perm);     \
    tmp3 = vec_adds (tmp2, src);                            \
    tmp = vec_packsu (tmp3, tmp3);                          \
    vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest);        \
    vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest);

    ADD (dest, vx0, perm0)    dest += stride;
    ADD (dest, vx1, perm1)    dest += stride;
    ADD (dest, vx2, perm0)    dest += stride;
    ADD (dest, vx3, perm1)    dest += stride;
    ADD (dest, vx4, perm0)    dest += stride;
    ADD (dest, vx5, perm1)    dest += stride;
    ADD (dest, vx6, perm0)    dest += stride;
    ADD (dest, vx7, perm1)

    POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1);
}