FFmpeg
h264_altivec_template.c
/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mem.h"

#ifdef DEBUG
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif
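
/* This file is used as a template: it is #included with OP_U8_ALTIVEC and the
 * PREFIX_* function-name macros defined by the including file, so the same
 * code generates both the put_ (plain store) and avg_ (average with the
 * existing destination) variants of each function. */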

/* this code assumes that stride % 16 == 0 */

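/* CHROMA_MC8_ALTIVEC_CORE vectorizes one row of the H.264 bilinear chroma
 * interpolation.  As an illustrative scalar sketch (not part of this file),
 * each output pixel is
 *
 *     dst[i] = (A*src[i]          + B*src[i + 1] +
 *               C*src[i + stride] + D*src[i + stride + 1] + bias) >> 6;
 *
 * with A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y and D = x*y (the ABCD[]
 * table below).  BIAS1 is the rounding constant folded into the first
 * multiply-accumulate, BIAS2 a hook applied after the accumulation; the
 * noop/add28 macros further down select the rounding behaviour. */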
#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = BIAS2(psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
\
        dst += stride;\
        src += stride;

#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        dst += stride;\
        src += stride;

#define noop(a) a
#define add28(a) vec_add(v28ss, a)

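/* Rounding selection for CHROMA_MC8_ALTIVEC_CORE: the H.264 function uses
 * BIAS1 = v32ss and BIAS2 = noop, i.e. (sum + 32) >> 6, while the VC-1
 * "no rounding" variant below uses BIAS1 = 0 and BIAS2 = add28, i.e.
 * (sum + 28) >> 6 with v28ss = 32 - 4. */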
#ifdef PREFIX_h264_chroma_mc8_altivec
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                           int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
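    /* AltiVec vec_ld only loads 16-byte aligned blocks, so each row of nine
     * source bytes is assembled with vec_perm using permute vectors from
     * vec_lvsl.  loadSecond is set when those nine bytes can spill into the
     * next 16-byte block (src % 16 > 7), requiring a second load;
     * reallyBadAlign (src % 16 == 15) is the corner case where the row
     * shifted by one pixel is exactly the second block, which is then used
     * directly because the vec_lvsl permute would wrap around instead. */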

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);

    if (ABCD[3]) {
        if (!loadSecond) {// -> !reallyBadAlign
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        } else {
            vec_u8 vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0 B == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else { // y == 0 C == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
}
#endif

/* this code assumes that stride % 16 == 0 */
#ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);

    if (!loadSecond) {// -> !reallyBadAlign
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    } else {
        vec_u8 vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    }
}
#endif

#undef noop
#undef add28
#undef CHROMA_MC8_ALTIVEC_CORE

/* this code assumes stride % 16 == 0 */
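/* Horizontal 6-tap H.264 luma half-pel filter.  As an illustrative scalar
 * sketch (not part of this file), each output pixel is
 *
 *     dst[i] = av_clip_uint8((20 * (src[i]     + src[i + 1]) -
 *                              5 * (src[i - 1] + src[i + 2]) +
 *                                  (src[i - 2] + src[i + 3]) + 16) >> 5);
 *
 * The vector code computes 16 pixels per row, split into a high and a low
 * half of eight 16-bit lanes (the A/B suffixes below). */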
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, fsum;

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

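        /* The six shifted source vectors normally come from the two 16-byte
         * loads srcR1/srcR2.  For align >= 12 the 21 bytes needed per row
         * spill into a third 16-byte block, hence the extra vec_ld(30, src)
         * below; for align >= 11 some shifted vectors coincide exactly with
         * a loaded block and are used directly, because the vec_lvsl-based
         * permute would wrap around instead. */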
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif

/* this code assumes stride % 16 == 0 */
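/* Vertical 6-tap H.264 luma half-pel filter: the same (1,-5,20,20,-5,1)
 * kernel as above, applied down a column.  The six source rows are kept in
 * registers as 16-bit halves (the ssA/ssB pairs) and shifted down by one row
 * per iteration, so each loop iteration only loads the new bottom row
 * (srcP3). */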
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, fsum, srcP3a, srcP3b, srcP3;

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif

/* this code assumes stride % 16 == 0 *and* that tmp is properly aligned */
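/* Combined horizontal+vertical half-pel filter, done in two passes: the
 * first loop runs the horizontal 6-tap filter over the 16+5 rows needed and
 * stores the unrounded, unshifted 16-bit intermediates in tmp; the second
 * loop runs the same 6-tap filter vertically over those intermediates in
 * 32-bit precision and normalizes with (value + 512) >> 10 before packing
 * back to bytes. */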
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum;
    vec_s16 ssume, ssumo;

    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

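        /* Widen to 32 bits: vec_mule/vec_mulo give the even/odd 16-bit lanes
         * of each product.  For the unmultiplied sum3 term, the even lanes
         * are obtained by reinterpreting as 32-bit and arithmetic-shifting
         * right by 16 (the even halfword is the high half on big-endian),
         * the odd lanes via a multiply by 1.  The even/odd results are
         * re-interleaved at the end with mperm. */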
        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif