/* All-zero vector constant: vec_splat_s32(0) replicates the literal 0 into
 * every 32-bit element. Callers in this file cast it to other vector types
 * where a zero operand of a different element type is needed. */
35 #define vzero vec_splat_s32(0)
/*
 * First variant of the GET_LS / GET_VF helper pair (aligned-load flavour).
 *
 * GET_LS(a, b, c, s): loads the aligned vector located 16 bytes past byte
 * offset (b * 2) of s into a local l2, then sets `ls` to the bytes selected
 * from the pair (a, l2) by the permute control vector c. `ls` is not a macro
 * parameter: it must already be declared where the macro is expanded
 * (yuv2planeX_8 declares it).
 *
 * GET_VF(a, b, c, d): a receives the interleaved high halves of c and d,
 * b the interleaved low halves.
 *
 * NOTE(review): the closing lines of both macro bodies, and the conditional
 * that selects between this variant and the second one, are not visible in
 * this excerpt - presumably an endianness #if; confirm against the full file.
 */
38 #define GET_LS(a,b,c,s) {\
39 vector signed short l2 = vec_ld(((b) << 1) + 16, s);\
40 ls = vec_perm(a, l2, c);\
43 #define GET_VF(a, b, c,d) {\
44 a = vec_mergeh(c, d);\
45 b = vec_mergel(c, d);\
/*
 * Second variant of the GET_LS / GET_VF helper pair (VSX-load flavour).
 *
 * GET_LS(a, b, c, s): reloads a with vec_vsx_ld from 16 bytes past byte
 * offset (b * 2) of s. The permute argument c is not used by the visible
 * line.
 *
 * GET_VF(a, b, c, d): same interleave as the first variant but with the
 * operands swapped and the low/high halves exchanged: a takes the low-half
 * merge of (d, c), b the high-half merge of (d, c).
 *
 * NOTE(review): part of the GET_LS body (including whatever assigns `ls`)
 * and the closing lines of both macros are not visible in this excerpt.
 */
48 #define GET_LS(a,b,c,s) {\
50 a = vec_vsx_ld(((b) << 1) + 16, s);\
52 #define GET_VF(a, b, c, d) {\
53 a = vec_mergel(d, c);\
54 b = vec_mergeh(d, c);\
/*
 * yuv2planeX_8(d1, d2, l1, src, x, perm, filter): multiply-accumulate step
 * for eight 16-bit samples.
 *
 *  - GET_LS produces the sample vector `ls` from l1 / src at index x
 *    (and updates l1 as a side effect in the variants shown in this file).
 *  - vec_mule / vec_mulo multiply the even- and odd-numbered 16-bit elements
 *    of filter and ls, giving two vectors of 32-bit products (i1, i2).
 *  - GET_VF interleaves i1 and i2 into vf1 and vf2.
 *  - vf1 is added into d1 and vf2 into d2; both d1 and d2 are modified in
 *    place, so they must be lvalues.
 *
 * The body declares variables after statements and opens with `do {`.
 * NOTE(review): the terminating line of the macro is not visible in this
 * excerpt.
 */
58 #define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do {\
59 vector signed short ls;\
60 GET_LS(l1, x, perm, src);\
61 vector signed int i1 = vec_mule(filter, ls);\
62 vector signed int i2 = vec_mulo(filter, ls);\
63 vector signed int vf1, vf2;\
64 GET_VF(vf1, vf2, i1, i2);\
65 d1 = vec_add(d1, vf1);\
66 d2 = vec_add(d2, vf2);\
/*
 * First variant of LOAD_FILTER / LOAD_L1 (aligned-load flavour).
 *
 * LOAD_FILTER(vf, f): loads the aligned vector containing byte offset
 * `joffset` of f, then permutes it with the control vector that vec_lvsl
 * derives from the same address, so the data is rotated according to the
 * address misalignment.
 *
 * LOAD_L1(ll1, s, p): stores the vec_lvsl control vector for byte offset
 * `xoffset` of s in p and the corresponding aligned load in ll1.
 *
 * `joffset` and `xoffset` are not parameters; both are taken from the scope
 * in which the macros are expanded.
 *
 * NOTE(review): the closing lines of both macro bodies are not visible in
 * this excerpt.
 */
70 #define LOAD_FILTER(vf,f) {\
71 vector unsigned char perm0 = vec_lvsl(joffset, f);\
72 vf = vec_ld(joffset, f);\
73 vf = vec_perm(vf, vf, perm0);\
75 #define LOAD_L1(ll1,s,p){\
76 p = vec_lvsl(xoffset, s);\
77 ll1 = vec_ld(xoffset, s);\
/*
 * Second variant of LOAD_FILTER / LOAD_L1 (VSX-load flavour).
 *
 * LOAD_FILTER(vf, f): vf = vec_vsx_ld at byte offset `joffset` of f.
 * LOAD_L1(ll1, s, p): ll1 = vec_vsx_ld at byte offset `xoffset` of s; the
 * permute argument p is not used by the visible line.
 *
 * As in the first variant, `joffset` and `xoffset` come from the expansion
 * site. NOTE(review): the closing lines of both macro bodies are not visible
 * in this excerpt.
 */
80 #define LOAD_FILTER(vf,f) {\
81 vf = vec_vsx_ld(joffset, f);\
83 #define LOAD_L1(ll1,s,p){\
84 ll1 = vec_vsx_ld(xoffset, s);\
/*
 * Vector kernel producing 16 output values starting at index x.
 *
 * Four 32-bit accumulators are seeded with a dither bias, then for each of
 * the filterSize taps the coefficient filter[j] is multiplied with sixteen
 * consecutive samples of src[j] (two groups of eight, at x and x + 8) and
 * added in. The sums are shifted right by 19 and narrowed with unsigned
 * saturation to 8-bit. Bias (<< 12) and shift (>> 19) are the same as in the
 * scalar helper yuv2planeX_u in this file.
 *
 * NOTE(review): the remaining parameters, the declarations of i, j and val,
 * the end of the tap loop and whatever stores vf are not visible in this
 * excerpt.
 */
88 static void yuv2planeX_16_altivec(
const int16_t *
filter,
int filterSize,
94 vector
signed int vo1, vo2, vo3, vo4;
95 vector
unsigned short vs1, vs2;
96 vector
unsigned char vf;
/* Shift count of 19 in every element, built as 10 + 9 - presumably because
 * the splat immediate cannot encode 19 directly. */
97 vector
unsigned int altivec_vectorShiftInt19 =
98 vec_add(vec_splat_u32(10), vec_splat_u32(9));
/* Seed values: the dither table is indexed modulo 8 and scaled by 2^12. */
100 for (i = 0; i < 16; i++)
101 val[i] = dither[(x + i + offset) & 7] << 12;
/* Load the 16 seeds as four vectors (byte offsets 0, 16, 32, 48 into val). */
103 vo1 = vec_ld(0,
val);
104 vo2 = vec_ld(16,
val);
105 vo3 = vec_ld(32,
val);
106 vo4 = vec_ld(48,
val);
/* One iteration per filter tap. */
108 for (j = 0; j < filterSize; j++) {
/* Offsets consumed by LOAD_FILTER / LOAD_L1: element index times two. */
109 unsigned int joffset=j<<1;
110 unsigned int xoffset=x<<1;
111 vector
unsigned char perm;
112 vector
signed short l1,vLumFilter;
113 LOAD_FILTER(vLumFilter,filter);
/* Broadcast element 0 of the loaded filter vector to all lanes. */
114 vLumFilter = vec_splat(vLumFilter, 0);
115 LOAD_L1(l1,src[j],perm);
/* Samples x .. x+7 accumulate into vo1/vo2, x+8 .. x+15 into vo3/vo4. */
116 yuv2planeX_8(vo1, vo2, l1, src[j], x, perm, vLumFilter);
117 yuv2planeX_8(vo3, vo4, l1, src[j], x + 8, perm, vLumFilter);
/* Arithmetic shift right by 19 of every accumulated sum. */
120 vo1 = vec_sra(vo1, altivec_vectorShiftInt19);
121 vo2 = vec_sra(vo2, altivec_vectorShiftInt19);
122 vo3 = vec_sra(vo3, altivec_vectorShiftInt19);
123 vo4 = vec_sra(vo4, altivec_vectorShiftInt19);
/* Narrow 32 -> 16 -> 8 bits with unsigned saturation. */
124 vs1 = vec_packsu(vo1, vo2);
125 vs2 = vec_packsu(vo3, vo4);
126 vf = vec_packsu(vs1, vs2);
/**
 * Scalar vertical filter for the pixel range [x, dstW).
 *
 * For every output position the accumulator starts at the dither value for
 * that position (table indexed modulo 8, scaled by 2^12), receives the sum of
 * src[tap][position] * filter[tap] over all taps, and is then shifted right
 * by 19 and clipped to the unsigned 8-bit range.
 *
 * @param filter     filter coefficients, one per tap
 * @param filterSize number of taps (and of source lines in src)
 * @param src        array of filterSize pointers to source lines
 * @param dest       output line; only elements x .. dstW - 1 are written
 * @param dstW       exclusive end index of the range to process
 * @param dither     dither table, read at indices 0..7
 * @param offset     added to the pixel index before the dither lookup
 * @param x          first pixel index to process
 */
static inline void yuv2planeX_u(const int16_t *filter, int filterSize,
                                const int16_t **src, uint8_t *dest, int dstW,
                                const uint8_t *dither, int offset, int x)
{
    int px, tap;

    for (px = x; px < dstW; px++) {
        /* Dither bias in the same fixed-point scale as the products. */
        int acc = dither[(px + offset) & 7] << 12;

        for (tap = 0; tap < filterSize; tap++)
            acc += src[tap][px] * filter[tap];

        dest[px] = av_clip_uint8(acc >> 19);
    }
}
/**
 * Vertical filter for one output line: scalar head, vector body, scalar tail.
 *
 * The line is split into three parts:
 *  1. a scalar head covering the pixels in front of the first 16-byte
 *     boundary of dest,
 *  2. whole groups of 16 pixels handled by yuv2planeX_16_altivec(),
 *  3. a scalar tail for whatever is left.
 *
 * @param filter     filter coefficients, one per tap
 * @param filterSize number of taps (and of source lines in src)
 * @param src        array of filterSize pointers to source lines
 * @param dest       output line, dstW bytes
 * @param dstW       number of pixels to produce
 * @param dither     dither table, read at indices 0..7
 * @param offset     added to the pixel index before the dither lookup
 */
static void yuv2planeX_altivec(const int16_t *filter, int filterSize,
                               const int16_t **src, uint8_t *dest, int dstW,
                               const uint8_t *dither, int offset)
{
    /* Number of bytes from dest up to the next 16-byte boundary (0..15). */
    int dst_u = -(uintptr_t)dest & 15;
    int i;

    /* A line shorter than the alignment gap must not be over-run: without
     * this clamp the head would write dest (and read src) beyond dstW. */
    if (dst_u > dstW)
        dst_u = dstW;

    yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0);

    /* dest + i is 16-byte aligned here: i starts at the alignment gap and
     * advances in steps of 16. When the clamp above took effect the loop
     * condition is false and the vector kernel is never entered.
     * NOTE(review): the trailing arguments of this call were not visible in
     * the reviewed text; (offset, i) matches what the 16-pixel kernel reads
     * as its dither offset and start index - confirm. */
    for (i = dst_u; i < dstW - 15; i += 16)
        yuv2planeX_16_altivec(filter, filterSize, src, dest + i, dither,
                              offset, i);

    /* Remaining pixels [i, dstW). */
    yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i);
}
/*
 * First variant of the horizontal-scaler load helpers (aligned-load flavour).
 *
 * GET_VF4(a, vf, f): loads the aligned vector at byte offset (a * 8) of f
 * and interleaves it with zero. NOTE(review): a low-half merge and a
 * high-half merge appear back to back here; the lines between them are not
 * visible in this excerpt, so a selecting condition is presumably missing -
 * confirm against the full file.
 *
 * FIRST_LOAD(sv, pos, s, per): sv = aligned load at byte offset pos of s,
 * per = vec_lvsl permute control vector for the same address.
 *
 * UPDATE_PTR(s0, d0, s1, d1): body not visible in this excerpt.
 *
 * LOAD_SRCV(pos, a, s, per, v0, v1, vf): loads the aligned vector 16 bytes
 * past byte offset (pos + a) of s into v1 and extracts vf from the pair
 * (v0, v1) using per.
 *
 * Macro arguments are used unparenthesised, so callers should pass simple
 * expressions. The closing lines of the macro bodies are not visible here.
 */
168 #define GET_VF4(a, vf, f) {\
169 vf = vec_ld(a<< 3, f);\
171 vf = vec_mergel(vf, (vector signed short)vzero);\
173 vf = vec_mergeh(vf, (vector signed short)vzero);\
175 #define FIRST_LOAD(sv, pos, s, per) {\
176 sv = vec_ld(pos, s);\
177 per = vec_lvsl(pos, s);\
179 #define UPDATE_PTR(s0, d0, s1, d1) {\
183 #define LOAD_SRCV(pos, a, s, per, v0, v1, vf) {\
184 v1 = vec_ld(pos + a + 16, s);\
185 vf = vec_perm(v0, v1, per);\
/*
 * LOAD_SRCV8(pos, a, s, per, v0, v1, vf): variant of LOAD_SRCV that skips the
 * second aligned load when it is not needed.
 *
 * The following vector (16 bytes past byte offset pos + a of s) is loaded
 * into v1 only when the address s + pos lies more than 8 bytes into its
 * 16-byte line - presumably because otherwise the eight bytes consumed by the
 * caller all come from v0. vf is then extracted from the pair (v0, v1) with
 * the permute control vector per.
 *
 * The permute uses the v1 parameter itself rather than a variable that
 * happens to be called src_v1 at the expansion site, so the macro no longer
 * depends on how the caller names its locals. When the condition is false v1
 * is passed to vec_perm unmodified.
 */
#define LOAD_SRCV8(pos, a, s, per, v0, v1, vf) {\
    if ((((uintptr_t)(s) + (pos)) % 16) > 8) {\
        v1 = vec_ld((pos) + (a) + 16, s);\
    }\
    vf = vec_perm(v0, v1, per);\
}
/*
 * GET_VFD(a, b, f, vf0, vf1, per, vf, off), first variant (aligned-load
 * flavour): loads into vf1 the aligned vector 16 bytes past byte offset
 * (a * 2 * filterSize) + (b * 2) + off of f, then extracts vf from the pair
 * (vf0, vf1) using the permute control vector per.
 *
 * `filterSize` is not a parameter; it is taken from the scope in which the
 * macro is expanded. NOTE(review): the closing line of the macro body is not
 * visible in this excerpt.
 */
193 #define GET_VFD(a, b, f, vf0, vf1, per, vf, off) {\
194 vf1 = vec_ld((a * 2 * filterSize) + (b * 2) + 16 + off, f);\
195 vf = vec_perm(vf0, vf1, per);\
/*
 * Second variant of the horizontal-scaler load helpers (VSX-load flavour).
 *
 * GET_VF4(a, vf, f): vec_vsx_ld at byte offset (a * 8) of f, cast to a
 * vector of signed shorts, then interleaved (high half) with zero.
 * FIRST_LOAD / UPDATE_PTR: expand to empty blocks in this variant.
 * LOAD_SRCV(pos, a, s, per, v0, v1, vf): vf = vec_vsx_ld at byte offset
 * (pos + a) of s; per, v0 and v1 are not used by the visible line.
 * LOAD_SRCV8: forwards all of its arguments to LOAD_SRCV.
 * GET_VFD(a, b, f, vf0, vf1, per, vf, off): vf = vec_vsx_ld at byte offset
 * (a * 2 * filterSize) + (b * 2) + off of f; `filterSize` comes from the
 * expansion site.
 *
 * NOTE(review): the closing lines of the multi-line macro bodies are not
 * visible in this excerpt.
 */
198 #define GET_VF4(a, vf, f) {\
199 vf = (vector signed short)vec_vsx_ld(a << 3, f);\
200 vf = vec_mergeh(vf, (vector signed short)vzero);\
202 #define FIRST_LOAD(sv, pos, s, per) {}
203 #define UPDATE_PTR(s0, d0, s1, d1) {}
204 #define LOAD_SRCV(pos, a, s, per, v0, v1, vf) {\
205 vf = vec_vsx_ld(pos + a, s);\
207 #define LOAD_SRCV8(pos, a, s, per, v0, v1, vf) LOAD_SRCV(pos, a, s, per, v0, v1, vf)
208 #define GET_VFD(a, b, f, vf0, vf1, per, vf, off) {\
209 vf = vec_vsx_ld((a * 2 * filterSize) + (b * 2) + off, f);\
213 static void hScale_altivec_real(
SwsContext *
c, int16_t *dst,
int dstW,
214 const uint8_t *src,
const int16_t *filter,
215 const int32_t *filterPos,
int filterSize)
220 if (filterSize % 4) {
221 for (i = 0; i < dstW; i++) {
223 register int srcPos = filterPos[i];
224 register int val = 0;
225 for (j = 0; j < filterSize; j++)
226 val += ((
int)src[srcPos + j]) * filter[filterSize * i + j];
227 dst[i] =
FFMIN(val >> 7, (1 << 15) - 1);
230 switch (filterSize) {
232 for (i = 0; i < dstW; i++) {
233 register int srcPos = filterPos[i];
235 vector
unsigned char src_vF = unaligned_load(srcPos, src);
236 vector
signed short src_v, filter_v;
237 vector
signed int val_vEven, val_s;
239 (vector
signed short)(VEC_MERGEH((vector
unsigned char)vzero, src_vF));
241 src_v = vec_mergeh(src_v, (vector
signed short)vzero);
242 GET_VF4(i, filter_v, filter);
243 val_vEven = vec_mule(src_v, filter_v);
244 val_s = vec_sums(val_vEven, vzero);
245 vec_st(val_s, 0, tempo);
246 dst[i] =
FFMIN(tempo[3] >> 7, (1 << 15) - 1);
250 for (i = 0; i < dstW; i++) {
251 register int srcPos = filterPos[i];
252 vector
unsigned char src_vF, src_v0, src_v1;
253 vector
unsigned char permS;
254 vector
signed short src_v, filter_v;
255 vector
signed int val_v, val_s;
256 FIRST_LOAD(src_v0, srcPos, src, permS);
257 LOAD_SRCV8(srcPos, 0, src, permS, src_v0, src_v1, src_vF);
259 (vector
signed short)(VEC_MERGEH((vector
unsigned char)vzero, src_vF));
260 filter_v = vec_ld(i << 4, filter);
261 val_v = vec_msums(src_v, filter_v, (vector
signed int)vzero);
262 val_s = vec_sums(val_v, vzero);
263 vec_st(val_s, 0, tempo);
264 dst[i] =
FFMIN(tempo[3] >> 7, (1 << 15) - 1);
269 for (i = 0; i < dstW; i++) {
270 register int srcPos = filterPos[i];
272 vector
unsigned char src_vF = unaligned_load(srcPos, src);
273 vector
signed short src_vA =
274 (vector
signed short)(VEC_MERGEH((vector
unsigned char)vzero, src_vF));
275 vector
signed short src_vB =
276 (vector
signed short)(VEC_MERGEL((vector
unsigned char)vzero, src_vF));
277 vector
signed short filter_v0 = vec_ld(i << 5, filter);
278 vector
signed short filter_v1 = vec_ld((i << 5) + 16, filter);
280 vector
signed int val_acc = vec_msums(src_vA, filter_v0, (vector
signed int)vzero);
281 vector
signed int val_v = vec_msums(src_vB, filter_v1, val_acc);
283 vector
signed int val_s = vec_sums(val_v, vzero);
285 VEC_ST(val_s, 0, tempo);
286 dst[i] =
FFMIN(tempo[3] >> 7, (1 << 15) - 1);
291 for (i = 0; i < dstW; i++) {
292 register int j, offset = i * 2 * filterSize;
293 register int srcPos = filterPos[i];
295 vector
signed int val_s, val_v = (vector
signed int)vzero;
296 vector
signed short filter_v0R;
297 vector
unsigned char permF, src_v0, permS;
298 FIRST_LOAD(filter_v0R, offset, filter, permF);
299 FIRST_LOAD(src_v0, srcPos, src, permS);
301 for (j = 0; j < filterSize - 15; j += 16) {
302 vector
unsigned char src_v1, src_vF;
303 vector
signed short filter_v1R, filter_v2R, filter_v0, filter_v1;
304 LOAD_SRCV(srcPos, j, src, permS, src_v0, src_v1, src_vF);
305 vector
signed short src_vA =
306 (vector
signed short)(VEC_MERGEH((vector
unsigned char)vzero, src_vF));
307 vector
signed short src_vB =
308 (vector
signed short)(VEC_MERGEL((vector
unsigned char)vzero, src_vF));
309 GET_VFD(i, j, filter, filter_v0R, filter_v1R, permF, filter_v0, 0);
310 GET_VFD(i, j, filter, filter_v1R, filter_v2R, permF, filter_v1, 16);
312 vector
signed int val_acc = vec_msums(src_vA, filter_v0, val_v);
313 val_v = vec_msums(src_vB, filter_v1, val_acc);
314 UPDATE_PTR(filter_v2R, filter_v0R, src_v1, src_v0);
317 if (j < filterSize - 7) {
319 vector
unsigned char src_v1, src_vF;
320 vector
signed short src_v, filter_v1R, filter_v;
321 LOAD_SRCV8(srcPos, j, src, permS, src_v0, src_v1, src_vF);
323 (vector
signed short)(VEC_MERGEH((vector
unsigned char)vzero, src_vF));
324 GET_VFD(i, j, filter, filter_v0R, filter_v1R, permF, filter_v, 0);
325 val_v = vec_msums(src_v, filter_v, val_v);
327 val_s = vec_sums(val_v, vzero);
329 VEC_ST(val_s, 0, tempo);
330 dst[i] =
FFMIN(tempo[3] >> 7, (1 << 15) - 1);