FFmpeg
postprocess_altivec_template.c
Go to the documentation of this file.
1 /*
2  * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
3  *
4  * based on code by Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 #include "libavutil/avutil.h"
24 #include "libavutil/mem_internal.h"
25 
/*
 * Per its name, transposes an 8x8 block of 16-bit elements held in eight
 * vectors (one vector per argument). It is implemented as three rounds of
 * vec_mergeh/vec_mergel; the results are assigned back to src_a..src_h, so
 * every argument must be a modifiable lvalue and is expanded more than once.
 * All temporaries take their type from src_a via __typeof__.
 */
26 #define ALTIVEC_TRANSPOSE_8x8_SHORT(src_a,src_b,src_c,src_d,src_e,src_f,src_g,src_h) \
27  do { \
28  __typeof__(src_a) tempA1, tempB1, tempC1, tempD1; \
29  __typeof__(src_a) tempE1, tempF1, tempG1, tempH1; \
30  __typeof__(src_a) tempA2, tempB2, tempC2, tempD2; \
31  __typeof__(src_a) tempE2, tempF2, tempG2, tempH2; \
32  tempA1 = vec_mergeh (src_a, src_e); \
33  tempB1 = vec_mergel (src_a, src_e); \
34  tempC1 = vec_mergeh (src_b, src_f); \
35  tempD1 = vec_mergel (src_b, src_f); \
36  tempE1 = vec_mergeh (src_c, src_g); \
37  tempF1 = vec_mergel (src_c, src_g); \
38  tempG1 = vec_mergeh (src_d, src_h); \
39  tempH1 = vec_mergel (src_d, src_h); \
40  tempA2 = vec_mergeh (tempA1, tempE1); \
41  tempB2 = vec_mergel (tempA1, tempE1); \
42  tempC2 = vec_mergeh (tempB1, tempF1); \
43  tempD2 = vec_mergel (tempB1, tempF1); \
44  tempE2 = vec_mergeh (tempC1, tempG1); \
45  tempF2 = vec_mergel (tempC1, tempG1); \
46  tempG2 = vec_mergeh (tempD1, tempH1); \
47  tempH2 = vec_mergel (tempD1, tempH1); \
48  src_a = vec_mergeh (tempA2, tempE2); \
49  src_b = vec_mergel (tempA2, tempE2); \
50  src_c = vec_mergeh (tempB2, tempF2); \
51  src_d = vec_mergel (tempB2, tempF2); \
52  src_e = vec_mergeh (tempC2, tempG2); \
53  src_f = vec_mergel (tempC2, tempG2); \
54  src_g = vec_mergeh (tempD2, tempH2); \
55  src_h = vec_mergel (tempD2, tempH2); \
56  } while (0)
57 
58 
/*
 * Classifies the 8 lines that start 4 lines below src (src + 4 * stride).
 * Only the first 8 bytes of each line are used (vec_mergeh widens the high
 * half of each loaded vector to 16 bits).
 *
 * For each of the 7 adjacent line pairs and each of the 8 columns, a hit is
 * counted when (line[i] - line[i+1] + dcOffset), reinterpreted as unsigned,
 * is below dcThreshold, where
 *   dcOffset    = ((c->nonBQP * c->ppMode.baseDcDiff) >> 8) + 1
 *   dcThreshold = 2 * dcOffset + 1
 *
 * Returns:
 *   2 when the hit count does not exceed c->ppMode.flatnessThreshold;
 *   0 when it does and any selected element of (mmoL - mmoR + 2 * c->QP),
 *     reinterpreted as unsigned, is greater than 4 * c->QP;
 *   1 otherwise.
 * What callers do with 0/1/2 is not visible in this block.
 */
59 static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c) {
60  /*
61  this code makes no assumption on src or stride.
62  One could remove the recomputation of the perm
63  vector by assuming (stride % 16) == 0, unfortunately
64  this is not always true.
65  */
66  short data_0 = ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
 /* Only the first four slots are given values; the rest are zero-filled.
    Each used slot is broadcast into its own vector further down. */
67  DECLARE_ALIGNED(16, short, data)[8] =
68  {
69  data_0,
70  data_0 * 2 + 1,
71  c->QP * 2,
72  c->QP * 4
73  };
74  int numEq;
75  uint8_t *src2 = src;
76  vector signed short v_dcOffset;
77  vector signed short v2QP;
78  vector unsigned short v4QP;
79  vector unsigned short v_dcThreshold;
 /* NOTE(review): both values below are remainders modulo 16, so a non-zero
    value means "NOT 16-byte aligned", despite the name properStride.
    srcAlign is taken before src2 is advanced by 4 * stride. */
80  const int properStride = (stride % 16);
81  const int srcAlign = ((unsigned long)src2 % 16);
82  const int two_vectors = ((srcAlign > 8) || properStride) ? 1 : 0;
83  const vector signed int zero = vec_splat_s32(0);
84  const vector signed short mask = vec_splat_s16(1);
85  vector signed int v_numEq = vec_splat_s32(0);
86  vector signed short v_data = vec_ld(0, data);
87  vector signed short v_srcAss0, v_srcAss1, v_srcAss2, v_srcAss3,
88  v_srcAss4, v_srcAss5, v_srcAss6, v_srcAss7;
89 //FIXME avoid this mess if possible
90  register int j0 = 0,
91  j1 = stride,
92  j2 = 2 * stride,
93  j3 = 3 * stride,
94  j4 = 4 * stride,
95  j5 = 5 * stride,
96  j6 = 6 * stride,
97  j7 = 7 * stride;
98  vector unsigned char v_srcA0, v_srcA1, v_srcA2, v_srcA3,
99  v_srcA4, v_srcA5, v_srcA6, v_srcA7;
100 
101  v_dcOffset = vec_splat(v_data, 0);
102  v_dcThreshold = (vector unsigned short)vec_splat(v_data, 1);
103  v2QP = vec_splat(v_data, 2);
104  v4QP = (vector unsigned short)vec_splat(v_data, 3);
105 
106  src2 += stride * 4;
107 
 /* Unaligned load of line i: vec_lvsl builds the realignment permute and
    vec_perm extracts the 16 bytes starting at src2 + j<i>.
    NOTE(review): v_srcA2<i> is loaded only when two_vectors is set;
    otherwise it reaches vec_perm uninitialized. Presumably harmless because
    only the first 8 result bytes are consumed - confirm. */
108 #define LOAD_LINE(i) \
109  { \
110  vector unsigned char perm##i = vec_lvsl(j##i, src2); \
111  vector unsigned char v_srcA2##i; \
112  vector unsigned char v_srcA1##i = vec_ld(j##i, src2); \
113  if (two_vectors) \
114  v_srcA2##i = vec_ld(j##i + 16, src2); \
115  v_srcA##i = \
116  vec_perm(v_srcA1##i, v_srcA2##i, perm##i); \
117  v_srcAss##i = \
118  (vector signed short)vec_mergeh((vector signed char)zero, \
119  (vector signed char)v_srcA##i); }
120 
 /* Aligned load of line i: a single vec_ld with no realignment. */
121 #define LOAD_LINE_ALIGNED(i) \
122  v_srcA##i = vec_ld(j##i, src2); \
123  v_srcAss##i = \
124  (vector signed short)vec_mergeh((vector signed char)zero, \
125  (vector signed char)v_srcA##i)
126 
127  /* Special-casing the aligned case is worthwhile, as all calls from
128  * the (transposed) horizontal deblocks will be aligned, in addition
129  * to the naturally aligned vertical deblocks. */
 /* NOTE(review): as written this condition is true only when BOTH the
    stride and the source address are misaligned (see the remainders above),
    which contradicts the "aligned case" comment - confirm the intended
    polarity. The embedded listing numbers jump from 130 to 139 here, so the
    body of this branch is not visible in this extract. */
130  if (properStride && srcAlign) {
139  } else {
140  LOAD_LINE(0);
141  LOAD_LINE(1);
142  LOAD_LINE(2);
143  LOAD_LINE(3);
144  LOAD_LINE(4);
145  LOAD_LINE(5);
146  LOAD_LINE(6);
147  LOAD_LINE(7);
148  }
149 #undef LOAD_LINE
150 #undef LOAD_LINE_ALIGNED
151 
 /* ITER(i, j): v_part<i> holds 1 in every lane where
    (line i - line j + dcOffset), compared as unsigned, is < dcThreshold,
    and 0 elsewhere. */
152 #define ITER(i, j) \
153  const vector signed short v_diff##i = \
154  vec_sub(v_srcAss##i, v_srcAss##j); \
155  const vector signed short v_sum##i = \
156  vec_add(v_diff##i, v_dcOffset); \
157  const vector signed short v_comp##i = \
158  (vector signed short)vec_cmplt((vector unsigned short)v_sum##i, \
159  v_dcThreshold); \
160  const vector signed short v_part##i = vec_and(mask, v_comp##i);
161 
162  {
163  ITER(0, 1)
164  ITER(1, 2)
165  ITER(2, 3)
166  ITER(3, 4)
167  ITER(4, 5)
168  ITER(5, 6)
169  ITER(6, 7)
170 
171  v_numEq = vec_sum4s(v_part0, v_numEq);
172  v_numEq = vec_sum4s(v_part1, v_numEq);
173  v_numEq = vec_sum4s(v_part2, v_numEq);
174  v_numEq = vec_sum4s(v_part3, v_numEq);
175  v_numEq = vec_sum4s(v_part4, v_numEq);
176  v_numEq = vec_sum4s(v_part5, v_numEq);
177  v_numEq = vec_sum4s(v_part6, v_numEq);
178  }
179 
180 #undef ITER
181 
 /* Horizontal add: vec_sums leaves the total in element 3, which is then
    broadcast and written to the scalar numEq. */
182  v_numEq = vec_sums(v_numEq, zero);
183 
184  v_numEq = vec_splat(v_numEq, 3);
185  vec_ste(v_numEq, 0, &numEq);
186 
187  if (numEq > c->ppMode.flatnessThreshold){
 /* mmoP1/mmoP2 pick individual 16-bit elements out of two line vectors;
    index 0x1f fills the lanes that mmoP later discards. mmoP is the
    sequence 8..23, i.e. the low half of its first operand followed by the
    high half of its second. */
188  const vector unsigned char mmoP1 = (const vector unsigned char)
189  {0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
190  0x00, 0x01, 0x12, 0x13, 0x08, 0x09, 0x1A, 0x1B};
191  const vector unsigned char mmoP2 = (const vector unsigned char)
192  {0x04, 0x05, 0x16, 0x17, 0x0C, 0x0D, 0x1E, 0x1F,
193  0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f};
194  const vector unsigned char mmoP = (const vector unsigned char)
195  vec_lvsl(8, (unsigned char*)0);
196 
197  vector signed short mmoL1 = vec_perm(v_srcAss0, v_srcAss2, mmoP1);
198  vector signed short mmoL2 = vec_perm(v_srcAss4, v_srcAss6, mmoP2);
199  vector signed short mmoL = vec_perm(mmoL1, mmoL2, mmoP);
200  vector signed short mmoR1 = vec_perm(v_srcAss5, v_srcAss7, mmoP1);
201  vector signed short mmoR2 = vec_perm(v_srcAss1, v_srcAss3, mmoP2);
202  vector signed short mmoR = vec_perm(mmoR1, mmoR2, mmoP);
203  vector signed short mmoDiff = vec_sub(mmoL, mmoR);
204  vector unsigned short mmoSum = (vector unsigned short)vec_add(mmoDiff, v2QP);
205 
206  if (vec_any_gt(mmoSum, v4QP))
207  return 0;
208  else
209  return 1;
210  }
211  else return 2;
212 }
213 
/*
 * Vertical low-pass filter over the first 8 bytes of 10 consecutive lines
 * that start 3 lines below src (src + 3 * stride). Lines 1..8 of that window
 * are rewritten in place; lines 0 and 9 are only read.
 *
 * v_first is line 0 where |line0 - line1| < c->QP, else line 1; v_last is
 * line 9 where |line8 - line9| < c->QP, else line 8. Each output line j is
 *   (sumsB[j-1] + sumsB[j+1] + 2 * line[j]) >> 4
 * where sumsB[] are the running sums built below (sumsB[0] includes a +4
 * rounding term).
 */
214 static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c) {
215  /*
216  this code makes no assumption on src or stride.
217  One could remove the recomputation of the perm
218  vector by assuming (stride % 16) == 0, unfortunately
219  this is not always true. Quite a lot of load/stores
220  can be removed by assuming proper alignment of
221  src & stride :-(
222  */
223  uint8_t *src2 = src;
224  const vector signed int zero = vec_splat_s32(0);
 /* NOTE(review): both values below are remainders modulo 16, so a non-zero
    value means "NOT 16-byte aligned", despite the name properStride. */
225  const int properStride = (stride % 16);
226  const int srcAlign = ((unsigned long)src2 % 16);
 /* Only qp[0] is set; it is broadcast to all lanes by vec_splat below. */
227  DECLARE_ALIGNED(16, short, qp)[8] = {c->QP};
228  vector signed short vqp = vec_ld(0, qp);
229  vector signed short vb0, vb1, vb2, vb3, vb4, vb5, vb6, vb7, vb8, vb9;
230  vector unsigned char vbA0, av_uninit(vbA1), av_uninit(vbA2), av_uninit(vbA3), av_uninit(vbA4), av_uninit(vbA5), av_uninit(vbA6), av_uninit(vbA7), av_uninit(vbA8), vbA9;
231  vector unsigned char vbB0, av_uninit(vbB1), av_uninit(vbB2), av_uninit(vbB3), av_uninit(vbB4), av_uninit(vbB5), av_uninit(vbB6), av_uninit(vbB7), av_uninit(vbB8), vbB9;
232  vector unsigned char vbT0, vbT1, vbT2, vbT3, vbT4, vbT5, vbT6, vbT7, vbT8, vbT9;
233  vector unsigned char perml0, perml1, perml2, perml3, perml4,
234  perml5, perml6, perml7, perml8, perml9;
235  register int j0 = 0,
236  j1 = stride,
237  j2 = 2 * stride,
238  j3 = 3 * stride,
239  j4 = 4 * stride,
240  j5 = 5 * stride,
241  j6 = 6 * stride,
242  j7 = 7 * stride,
243  j8 = 8 * stride,
244  j9 = 9 * stride;
245 
246  vqp = vec_splat(vqp, 0);
247 
248  src2 += stride*3;
249 
 /* Unaligned load of line i: keeps both raw vectors (vbA/vbB) for the
    read-modify-write store later, the realigned bytes in vbT, and the first
    8 bytes widened to 16 bits in vb. */
250 #define LOAD_LINE(i) \
251  perml##i = vec_lvsl(i * stride, src2); \
252  vbA##i = vec_ld(i * stride, src2); \
253  vbB##i = vec_ld(i * stride + 16, src2); \
254  vbT##i = vec_perm(vbA##i, vbB##i, perml##i); \
255  vb##i = \
256  (vector signed short)vec_mergeh((vector unsigned char)zero, \
257  (vector unsigned char)vbT##i)
258 
 /* Aligned load of line i: a single vec_ld; vbA/vbB are not filled in. */
259 #define LOAD_LINE_ALIGNED(i) \
260  vbT##i = vec_ld(j##i, src2); \
261  vb##i = \
262  (vector signed short)vec_mergeh((vector signed char)zero, \
263  (vector signed char)vbT##i)
264 
265  /* Special-casing the aligned case is worthwhile, as all calls from
266  * the (transposed) horizontal deblocks will be aligned, in addition
267  * to the naturally aligned vertical deblocks. */
 /* NOTE(review): as written this condition is true only when BOTH the
    stride and the source address are misaligned, which contradicts the
    "aligned case" comment - confirm the intended polarity. The embedded
    listing numbers jump from 268 to 279 here, so the body of this branch is
    not visible in this extract. */
268  if (properStride && srcAlign) {
279  } else {
280  LOAD_LINE(0);
281  LOAD_LINE(1);
282  LOAD_LINE(2);
283  LOAD_LINE(3);
284  LOAD_LINE(4);
285  LOAD_LINE(5);
286  LOAD_LINE(6);
287  LOAD_LINE(7);
288  LOAD_LINE(8);
289  LOAD_LINE(9);
290  }
291 #undef LOAD_LINE
292 #undef LOAD_LINE_ALIGNED
293  {
294  const vector unsigned short v_2 = vec_splat_u16(2);
295  const vector unsigned short v_4 = vec_splat_u16(4);
296 
 /* Edge handling: use the outer line only where it differs from its
    neighbour by less than QP, otherwise repeat the inner line. */
297  const vector signed short v_diff01 = vec_sub(vb0, vb1);
298  const vector unsigned short v_cmp01 =
299  (const vector unsigned short) vec_cmplt(vec_abs(v_diff01), vqp);
300  const vector signed short v_first = vec_sel(vb1, vb0, v_cmp01);
301  const vector signed short v_diff89 = vec_sub(vb8, vb9);
302  const vector unsigned short v_cmp89 =
303  (const vector unsigned short) vec_cmplt(vec_abs(v_diff89), vqp);
304  const vector signed short v_last = vec_sel(vb8, vb9, v_cmp89);
305 
 /* sumsB0 = 4 * first + line1 + line2 + line3 + 4. Every following sum
    drops one term from the top of the window and adds one at the bottom. */
306  const vector signed short temp01 = vec_mladd(v_first, (vector signed short)v_4, vb1);
307  const vector signed short temp02 = vec_add(vb2, vb3);
308  const vector signed short temp03 = vec_add(temp01, (vector signed short)v_4);
309  const vector signed short v_sumsB0 = vec_add(temp02, temp03);
310 
311  const vector signed short temp11 = vec_sub(v_sumsB0, v_first);
312  const vector signed short v_sumsB1 = vec_add(temp11, vb4);
313 
314  const vector signed short temp21 = vec_sub(v_sumsB1, v_first);
315  const vector signed short v_sumsB2 = vec_add(temp21, vb5);
316 
317  const vector signed short temp31 = vec_sub(v_sumsB2, v_first);
318  const vector signed short v_sumsB3 = vec_add(temp31, vb6);
319 
320  const vector signed short temp41 = vec_sub(v_sumsB3, v_first);
321  const vector signed short v_sumsB4 = vec_add(temp41, vb7);
322 
323  const vector signed short temp51 = vec_sub(v_sumsB4, vb1);
324  const vector signed short v_sumsB5 = vec_add(temp51, vb8);
325 
326  const vector signed short temp61 = vec_sub(v_sumsB5, vb2);
327  const vector signed short v_sumsB6 = vec_add(temp61, v_last);
328 
329  const vector signed short temp71 = vec_sub(v_sumsB6, vb3);
330  const vector signed short v_sumsB7 = vec_add(temp71, v_last);
331 
332  const vector signed short temp81 = vec_sub(v_sumsB7, vb4);
333  const vector signed short v_sumsB8 = vec_add(temp81, v_last);
334 
335  const vector signed short temp91 = vec_sub(v_sumsB8, vb5);
336  const vector signed short v_sumsB9 = vec_add(temp91, v_last);
337 
 /* COMPUTE_VR(i, j, k): vr<j> = (sumsB<i> + sumsB<k> + 2 * vb<j>) >> 4,
    using an arithmetic shift. */
338  #define COMPUTE_VR(i, j, k) \
339  const vector signed short temps1##i = \
340  vec_add(v_sumsB##i, v_sumsB##k); \
341  const vector signed short temps2##i = \
342  vec_mladd(vb##j, (vector signed short)v_2, temps1##i); \
343  const vector signed short vr##j = vec_sra(temps2##i, v_4)
344 
345  COMPUTE_VR(0, 1, 2);
346  COMPUTE_VR(1, 2, 3);
347  COMPUTE_VR(2, 3, 4);
348  COMPUTE_VR(3, 4, 5);
349  COMPUTE_VR(4, 5, 6);
350  COMPUTE_VR(5, 6, 7);
351  COMPUTE_VR(6, 7, 8);
352  COMPUTE_VR(7, 8, 9);
353 
354  const vector signed char neg1 = vec_splat_s8(-1);
 /* permHH: bytes 0..7 from the first operand, bytes 8..15 from the second,
    so the untouched right half of each line is carried over unchanged. */
355  const vector unsigned char permHH = (const vector unsigned char){0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
356  0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F};
357 
 /* Unaligned store of line i: saturate-pack the result to bytes, rotate it
    into position with vec_lvsr, blend it into the two originally loaded
    vectors (vbA/vbB) and write both back. */
358 #define PACK_AND_STORE(i) \
359 { const vector unsigned char perms##i = \
360  vec_lvsr(i * stride, src2); \
361  const vector unsigned char vf##i = \
362  vec_packsu(vr##i, (vector signed short)zero); \
363  const vector unsigned char vg##i = \
364  vec_perm(vf##i, vbT##i, permHH); \
365  const vector unsigned char mask##i = \
366  vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \
367  const vector unsigned char vg2##i = \
368  vec_perm(vg##i, vg##i, perms##i); \
369  const vector unsigned char svA##i = \
370  vec_sel(vbA##i, vg2##i, mask##i); \
371  const vector unsigned char svB##i = \
372  vec_sel(vg2##i, vbB##i, mask##i); \
373  vec_st(svA##i, i * stride, src2); \
374  vec_st(svB##i, i * stride + 16, src2);}
375 
 /* Aligned store of line i: a single vec_st of the packed result. */
376 #define PACK_AND_STORE_ALIGNED(i) \
377 { const vector unsigned char vf##i = \
378  vec_packsu(vr##i, (vector signed short)zero); \
379  const vector unsigned char vg##i = \
380  vec_perm(vf##i, vbT##i, permHH); \
381  vec_st(vg##i, i * stride, src2);}
382 
383  /* Special-casing the aligned case is worthwhile, as all calls from
384  * the (transposed) horizontal deblocks will be aligned, in addition
385  * to the naturally aligned vertical deblocks. */
 /* NOTE(review): same condition as for the loads above - true only when
    both stride and source are misaligned. The embedded listing numbers jump
    from 386 to 395 here, so the body of this branch is not visible in this
    extract. */
386  if (properStride && srcAlign) {
395  } else {
396  PACK_AND_STORE(1)
397  PACK_AND_STORE(2)
398  PACK_AND_STORE(3)
399  PACK_AND_STORE(4)
400  PACK_AND_STORE(5)
401  PACK_AND_STORE(6)
402  PACK_AND_STORE(7)
403  PACK_AND_STORE(8)
404  }
405  #undef PACK_AND_STORE
406  #undef PACK_AND_STORE_ALIGNED
407  }
408 }
409 
410 
411 
/*
 * Vertical default deblocking filter. Works on the first 8 bytes of lines
 * 1..8 counted from src + 3 * stride and rewrites lines 4 and 5 in place.
 *
 * With l1..l8 the widened lines:
 *   mE = 5 * (l5 - l4) + 2 * (l3 - l6)            (middle energy)
 *   lE = 5 * (l3 - l2) + 2 * (l1 - l4)            (left energy)
 *   rE = 5 * (l7 - l6) + 2 * (l5 - l8)            (right energy)
 *   d  = (5 * max(|mE| - min(|lE|, |rE|), 0) + 32) >> 6
 * d takes the opposite sign of mE, is clamped to the range between 0 and
 * q = (l4 - l5) / 2, and is applied (l4 -= d, l5 += d) only in lanes where
 * |mE| < 8 * c->QP.
 */
412 static inline void doVertDefFilter_altivec(uint8_t src[], int stride, PPContext *c) {
413  /*
414  this code makes no assumption on src or stride.
415  One could remove the recomputation of the perm
416  vector by assuming (stride % 16) == 0, unfortunately
417  this is not always true. Quite a lot of load/stores
418  can be removed by assuming proper alignment of
419  src & stride :-(
420  */
421  uint8_t *src2 = src + stride*3;
422  const vector signed int zero = vec_splat_s32(0);
 /* Only qp[0] is set (to 8 * QP); it is broadcast to all lanes below. */
423  DECLARE_ALIGNED(16, short, qp)[8] = {8*c->QP};
424  vector signed short vqp = vec_splat(
425  (vector signed short)vec_ld(0, qp), 0);
426 
 /* Unaligned load of line i: keeps both raw vectors (vbA/vbB) for the
    read-modify-write store, the realigned bytes in vbT, and the first
    8 bytes widened to 16 bits in vb. */
427 #define LOAD_LINE(i) \
428  const vector unsigned char perm##i = \
429  vec_lvsl(i * stride, src2); \
430  const vector unsigned char vbA##i = \
431  vec_ld(i * stride, src2); \
432  const vector unsigned char vbB##i = \
433  vec_ld(i * stride + 16, src2); \
434  const vector unsigned char vbT##i = \
435  vec_perm(vbA##i, vbB##i, perm##i); \
436  const vector signed short vb##i = \
437  (vector signed short)vec_mergeh((vector unsigned char)zero, \
438  (vector unsigned char)vbT##i)
439 
440  LOAD_LINE(1);
441  LOAD_LINE(2);
442  LOAD_LINE(3);
443  LOAD_LINE(4);
444  LOAD_LINE(5);
445  LOAD_LINE(6);
446  LOAD_LINE(7);
447  LOAD_LINE(8);
448 #undef LOAD_LINE
449 
450  const vector signed short v_1 = vec_splat_s16(1);
451  const vector signed short v_2 = vec_splat_s16(2);
452  const vector signed short v_5 = vec_splat_s16(5);
 /* 32 is built as 1 << 5 rather than splatted directly. */
453  const vector signed short v_32 = vec_sl(v_1,
454  (vector unsigned short)v_5);
455  /* middle energy */
456  const vector signed short l3minusl6 = vec_sub(vb3, vb6);
457  const vector signed short l5minusl4 = vec_sub(vb5, vb4);
458  const vector signed short twotimes_l3minusl6 = vec_mladd(v_2, l3minusl6, (vector signed short)zero);
459  const vector signed short mE = vec_mladd(v_5, l5minusl4, twotimes_l3minusl6);
460  const vector signed short absmE = vec_abs(mE);
461  /* left & right energy */
462  const vector signed short l1minusl4 = vec_sub(vb1, vb4);
463  const vector signed short l3minusl2 = vec_sub(vb3, vb2);
464  const vector signed short l5minusl8 = vec_sub(vb5, vb8);
465  const vector signed short l7minusl6 = vec_sub(vb7, vb6);
466  const vector signed short twotimes_l1minusl4 = vec_mladd(v_2, l1minusl4, (vector signed short)zero);
467  const vector signed short twotimes_l5minusl8 = vec_mladd(v_2, l5minusl8, (vector signed short)zero);
468  const vector signed short lE = vec_mladd(v_5, l3minusl2, twotimes_l1minusl4);
469  const vector signed short rE = vec_mladd(v_5, l7minusl6, twotimes_l5minusl8);
470  /* d */
471  const vector signed short ddiff = vec_sub(absmE,
472  vec_min(vec_abs(lE),
473  vec_abs(rE)));
474  const vector signed short ddiffclamp = vec_max(ddiff, (vector signed short)zero);
475  const vector signed short dtimes64 = vec_mladd(v_5, ddiffclamp, v_32);
476  const vector signed short d = vec_sra(dtimes64, vec_splat_u16(6));
477  const vector signed short minusd = vec_sub((vector signed short)zero, d);
 /* finald is +d in lanes where mE is negative (-mE > 0), -d elsewhere. */
478  const vector signed short finald = vec_sel(minusd,
479  d,
480  vec_cmpgt(vec_sub((vector signed short)zero, mE),
481  (vector signed short)zero));
482  /* q */
483  const vector signed short qtimes2 = vec_sub(vb4, vb5);
484  /* for a shift right to behave like /2, we need to add one
485  to all negative integer */
486  const vector signed short rounddown = vec_sel((vector signed short)zero,
487  v_1,
488  vec_cmplt(qtimes2, (vector signed short)zero));
489  const vector signed short q = vec_sra(vec_add(qtimes2, rounddown), vec_splat_u16(1));
490  /* clamp */
 /* Positive q: clamp finald to [0, q]; otherwise clamp it to [q, 0]. */
491  const vector signed short dclamp_P1 = vec_max((vector signed short)zero, finald);
492  const vector signed short dclamp_P = vec_min(dclamp_P1, q);
493  const vector signed short dclamp_N1 = vec_min((vector signed short)zero, finald);
494  const vector signed short dclamp_N = vec_max(dclamp_N1, q);
495 
496  const vector signed short dclampedfinal = vec_sel(dclamp_N,
497  dclamp_P,
498  vec_cmpgt(q, (vector signed short)zero));
 /* Lanes where |mE| >= 8 * QP get a zero correction (left unfiltered). */
499  const vector signed short dornotd = vec_sel((vector signed short)zero,
500  dclampedfinal,
501  vec_cmplt(absmE, vqp));
502  /* add/subtract to l4 and l5 */
503  const vector signed short vb4minusd = vec_sub(vb4, dornotd);
504  const vector signed short vb5plusd = vec_add(vb5, dornotd);
505  /* finally, stores */
506  const vector unsigned char st4 = vec_packsu(vb4minusd, (vector signed short)zero);
507  const vector unsigned char st5 = vec_packsu(vb5plusd, (vector signed short)zero);
508 
509  const vector signed char neg1 = vec_splat_s8(-1);
 /* permHH: bytes 0..7 from the first operand, bytes 8..15 from the second,
    so the untouched right half of each line is carried over unchanged. */
510  const vector unsigned char permHH = (const vector unsigned char){0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
511  0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F};
512 
 /* Unaligned store of line i: rotate the new bytes into position with
    vec_lvsr, blend them into the two originally loaded vectors (vbA/vbB)
    and write both back. */
513 #define STORE(i) \
514 { const vector unsigned char perms##i = \
515  vec_lvsr(i * stride, src2); \
516  const vector unsigned char vg##i = \
517  vec_perm(st##i, vbT##i, permHH); \
518  const vector unsigned char mask##i = \
519  vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \
520  const vector unsigned char vg2##i = \
521  vec_perm(vg##i, vg##i, perms##i); \
522  const vector unsigned char svA##i = \
523  vec_sel(vbA##i, vg2##i, mask##i); \
524  const vector unsigned char svB##i = \
525  vec_sel(vg2##i, vbB##i, mask##i); \
526  vec_st(svA##i, i * stride, src2); \
527  vec_st(svB##i, i * stride + 16, src2);}
528 
 /* NOTE(review): STORE is never #undef'd inside this function. */
529  STORE(4)
530  STORE(5)
531 }
532 
/*
 * Deringing filter over the 10 lines starting at src; lines 1..8 are
 * written back, lines 0 and 9 are only read as neighbours.
 *
 * Phases, in order:
 *  1. Load 10 lines (unaligned) into src0..src9.
 *  2. Reduce bytes 1..8 of lines 1..8 to a minimum and a maximum; return
 *     without writing anything when (max - min) is below deringThreshold.
 *  3. Build, per line, a bit mask of the bytes that are greater than the
 *     min/max average, then combine the masks of neighbouring lines and
 *     bit positions into S[0..7].
 *  4. For every line j in 1..8 and every position whose bit is set in S,
 *     replace the pixel by a 3x3 weighted average (weights 1 2 1 / 2 4 2 /
 *     1 2 1, +8, >> 4), limited to within c->QP / 2 + 1 of the old value.
 *  5. Store lines 1..8 (unaligned read-modify-write).
 *
 * deringThreshold is not declared in this function; presumably it is a
 * macro or constant provided by the including file - confirm.
 */
533 static inline void dering_altivec(uint8_t src[], int stride, PPContext *c) {
534  const vector signed int vsint32_8 = vec_splat_s32(8);
535  const vector unsigned int vuint32_4 = vec_splat_u32(4);
536  const vector signed char neg1 = vec_splat_s8(-1);
537 
 /* permA1/permA2 gather three bytes from each of three lines into the
    first nine lanes; the *inc vectors advance them by one column per step.
    magic holds the 3x3 weights. extractPerm places one source byte in the
    low byte of every 32-bit lane (index 0x10 selects from the zero
    operand). */
538  const vector unsigned char permA1 = (vector unsigned char)
539  {0x00, 0x01, 0x02, 0x10, 0x11, 0x12, 0x1F, 0x1F,
540  0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F};
541  const vector unsigned char permA2 = (vector unsigned char)
542  {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x10, 0x11,
543  0x12, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F};
544  const vector unsigned char permA1inc = (vector unsigned char)
545  {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,
546  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
547  const vector unsigned char permA2inc = (vector unsigned char)
548  {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
549  0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
550  const vector unsigned char magic = (vector unsigned char)
551  {0x01, 0x02, 0x01, 0x02, 0x04, 0x02, 0x01, 0x02,
552  0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
553  const vector unsigned char extractPerm = (vector unsigned char)
554  {0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01,
555  0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01};
556  const vector unsigned char extractPermInc = (vector unsigned char)
557  {0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01,
558  0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01};
559  const vector unsigned char identity = vec_lvsl(0,(unsigned char *)0);
560  const vector unsigned char tenRight = (vector unsigned char)
561  {0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
562  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
563  const vector unsigned char eightLeft = (vector unsigned char)
564  {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
565  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08};
566 
567  /*
568  this code makes no assumption on src or stride.
569  One could remove the recomputation of the perm
570  vector by assuming (stride % 16) == 0, unfortunately
571  this is not always true. Quite a lot of load/stores
572  can be removed by assuming proper alignment of
573  src & stride :-(
574  */
575  uint8_t *srcCopy = src;
 /* Only dt[0] is set; it is broadcast to all lanes by vec_splat below. */
576  DECLARE_ALIGNED(16, uint8_t, dt)[16] = { deringThreshold };
577  const vector signed int zero = vec_splat_s32(0);
578  vector unsigned char v_dt = vec_splat(vec_ld(0, dt), 0);
579 
 /* Unaligned load of line i: sA/sB keep the two raw vectors for the
    read-modify-write store, src<i> holds the realigned 16 bytes. */
580 #define LOAD_LINE(i) \
581  const vector unsigned char perm##i = \
582  vec_lvsl(i * stride, srcCopy); \
583  vector unsigned char sA##i = vec_ld(i * stride, srcCopy); \
584  vector unsigned char sB##i = vec_ld(i * stride + 16, srcCopy); \
585  vector unsigned char src##i = vec_perm(sA##i, sB##i, perm##i)
586 
587  LOAD_LINE(0);
588  LOAD_LINE(1);
589  LOAD_LINE(2);
590  LOAD_LINE(3);
591  LOAD_LINE(4);
592  LOAD_LINE(5);
593  LOAD_LINE(6);
594  LOAD_LINE(7);
595  LOAD_LINE(8);
596  LOAD_LINE(9);
597 #undef LOAD_LINE
598 
599  vector unsigned char v_avg;
600  DECLARE_ALIGNED(16, signed int, S)[8];
 /* Only tQP2[0] is set (QP / 2 + 1); it is broadcast to all lanes below. */
601  DECLARE_ALIGNED(16, int, tQP2)[4] = { c->QP/2 + 1 };
602  vector signed int vQP2 = vec_ld(0, tQP2);
603  vQP2 = vec_splat(vQP2, 0);
604 
605  {
 /* trunc_perm packs bytes 1..8 of two lines into one vector, so four
    vectors cover lines 1..8. */
606  const vector unsigned char trunc_perm = (vector unsigned char)
607  {0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
608  0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
609  const vector unsigned char trunc_src12 = vec_perm(src1, src2, trunc_perm);
610  const vector unsigned char trunc_src34 = vec_perm(src3, src4, trunc_perm);
611  const vector unsigned char trunc_src56 = vec_perm(src5, src6, trunc_perm);
612  const vector unsigned char trunc_src78 = vec_perm(src7, src8, trunc_perm);
613 
 /* EXTRACT(op): reduces the four vectors with vec_<op>, then repeatedly
    folds the result onto itself via mergeh/mergel, storing the outcome in
    v_<op> (presumably the overall min/max replicated in every lane). */
614 #define EXTRACT(op) do { \
615  const vector unsigned char s_1 = vec_##op(trunc_src12, trunc_src34); \
616  const vector unsigned char s_2 = vec_##op(trunc_src56, trunc_src78); \
617  const vector unsigned char s_6 = vec_##op(s_1, s_2); \
618  const vector unsigned char s_8h = vec_mergeh(s_6, s_6); \
619  const vector unsigned char s_8l = vec_mergel(s_6, s_6); \
620  const vector unsigned char s_9 = vec_##op(s_8h, s_8l); \
621  const vector unsigned char s_9h = vec_mergeh(s_9, s_9); \
622  const vector unsigned char s_9l = vec_mergel(s_9, s_9); \
623  const vector unsigned char s_10 = vec_##op(s_9h, s_9l); \
624  const vector unsigned char s_10h = vec_mergeh(s_10, s_10); \
625  const vector unsigned char s_10l = vec_mergel(s_10, s_10); \
626  const vector unsigned char s_11 = vec_##op(s_10h, s_10l); \
627  const vector unsigned char s_11h = vec_mergeh(s_11, s_11); \
628  const vector unsigned char s_11l = vec_mergel(s_11, s_11); \
629  v_##op = vec_##op(s_11h, s_11l); \
630 } while (0)
631 
632  vector unsigned char v_min;
633  vector unsigned char v_max;
634  EXTRACT(min);
635  EXTRACT(max);
636 #undef EXTRACT
637 
 /* Early out: nothing is written when the range is below the threshold. */
638  if (vec_all_lt(vec_sub(v_max, v_min), v_dt))
639  return;
640 
641  v_avg = vec_avg(v_min, v_max);
642  }
643 
644  {
 /* mask1 assigns bits 0..7 to bytes 0..7 of a line, mask2 assigns bits 8
    and 9 to bytes 8 and 9; the remaining bytes are ignored. */
645  const vector unsigned short mask1 = (vector unsigned short)
646  {0x0001, 0x0002, 0x0004, 0x0008,
647  0x0010, 0x0020, 0x0040, 0x0080};
648  const vector unsigned short mask2 = (vector unsigned short)
649  {0x0100, 0x0200, 0x0000, 0x0000,
650  0x0000, 0x0000, 0x0000, 0x0000};
651 
652  const vector unsigned int vuint32_16 = vec_sl(vec_splat_u32(1), vec_splat_u32(4));
653  const vector unsigned int vuint32_1 = vec_splat_u32(1);
654 
655  vector signed int sumA2;
656  vector signed int sumB2;
657  vector signed int sum0, sum1, sum2, sum3, sum4;
658  vector signed int sum5, sum6, sum7, sum8, sum9;
659 
 /* COMPARE(i): sum<i> element 3 becomes a 10-bit mask with bit k set when
    byte k of line i is greater than v_avg (vec_sums puts its result in
    element 3). */
660 #define COMPARE(i) \
661  do { \
662  const vector unsigned char cmp = \
663  (vector unsigned char)vec_cmpgt(src##i, v_avg); \
664  const vector unsigned short cmpHi = \
665  (vector unsigned short)vec_mergeh(cmp, cmp); \
666  const vector unsigned short cmpLi = \
667  (vector unsigned short)vec_mergel(cmp, cmp); \
668  const vector signed short cmpHf = \
669  (vector signed short)vec_and(cmpHi, mask1); \
670  const vector signed short cmpLf = \
671  (vector signed short)vec_and(cmpLi, mask2); \
672  const vector signed int sump = vec_sum4s(cmpHf, zero); \
673  const vector signed int sumq = vec_sum4s(cmpLf, sump); \
674  sum##i = vec_sums(sumq, zero); \
675  } while (0)
676 
677  COMPARE(0);
678  COMPARE(1);
679  COMPARE(2);
680  COMPARE(3);
681  COMPARE(4);
682  COMPARE(5);
683  COMPARE(6);
684  COMPARE(7);
685  COMPARE(8);
686  COMPARE(9);
687 #undef COMPARE
688 
689  {
 /* Gather the per-line masks four at a time: sumA = lines 0..3,
    sumB = lines 4..7, sumC = lines 8..9 padded with zero. */
690  const vector signed int sump02 = vec_mergel(sum0, sum2);
691  const vector signed int sump13 = vec_mergel(sum1, sum3);
692  const vector signed int sumA = vec_mergel(sump02, sump13);
693 
694  const vector signed int sump46 = vec_mergel(sum4, sum6);
695  const vector signed int sump57 = vec_mergel(sum5, sum7);
696  const vector signed int sumB = vec_mergel(sump46, sump57);
697 
698  const vector signed int sump8A = vec_mergel(sum8, zero);
699  const vector signed int sump9B = vec_mergel(sum9, zero);
700  const vector signed int sumC = vec_mergel(sump8A, sump9B);
701 
 /* t2 = mask | (~mask << 16); y keeps a bit only when both horizontal
    neighbours (t2 >> 1 and t2 << 1) have it set as well. */
702  const vector signed int tA = vec_sl(vec_nor(zero, sumA), vuint32_16);
703  const vector signed int tB = vec_sl(vec_nor(zero, sumB), vuint32_16);
704  const vector signed int tC = vec_sl(vec_nor(zero, sumC), vuint32_16);
705  const vector signed int t2A = vec_or(sumA, tA);
706  const vector signed int t2B = vec_or(sumB, tB);
707  const vector signed int t2C = vec_or(sumC, tC);
708  const vector signed int t3A = vec_and(vec_sra(t2A, vuint32_1),
709  vec_sl(t2A, vuint32_1));
710  const vector signed int t3B = vec_and(vec_sra(t2B, vuint32_1),
711  vec_sl(t2B, vuint32_1));
712  const vector signed int t3C = vec_and(vec_sra(t2C, vuint32_1),
713  vec_sl(t2C, vuint32_1));
714  const vector signed int yA = vec_and(t2A, t3A);
715  const vector signed int yB = vec_and(t2B, t3B);
716  const vector signed int yC = vec_and(t2C, t3C);
717 
 /* strangeperm1/2 shift a vector pair by 4 and 8 bytes, i.e. by one and
    two 32-bit lanes, so each line is ANDed with the two lines after it. */
718  const vector unsigned char strangeperm1 = vec_lvsl(4, (unsigned char*)0);
719  const vector unsigned char strangeperm2 = vec_lvsl(8, (unsigned char*)0);
720  const vector signed int sumAd4 = vec_perm(yA, yB, strangeperm1);
721  const vector signed int sumAd8 = vec_perm(yA, yB, strangeperm2);
722  const vector signed int sumBd4 = vec_perm(yB, yC, strangeperm1);
723  const vector signed int sumBd8 = vec_perm(yB, yC, strangeperm2);
724  const vector signed int sumAp = vec_and(yA,
725  vec_and(sumAd4,sumAd8));
726  const vector signed int sumBp = vec_and(yB,
727  vec_and(sumBd4,sumBd8));
 /* Fold the upper 16 bits (built from the complemented mask) onto the
    lower 16 bits. */
728  sumA2 = vec_or(sumAp,
729  vec_sra(sumAp,
730  vuint32_16));
731  sumB2 = vec_or(sumBp,
732  vec_sra(sumBp,
733  vuint32_16));
734  }
735  vec_st(sumA2, 0, S);
736  vec_st(sumB2, 16, S);
737  }
738 
739  /* I'm not sure the following is actually faster
740  than straight, unvectorized C code :-( */
741 
 /* F_INIT: per-line working copies of the permute vectors, which F2
    advances by one column per call. */
742 #define F_INIT() \
743  vector unsigned char tenRightM = tenRight; \
744  vector unsigned char permA1M = permA1; \
745  vector unsigned char permA2M = permA2; \
746  vector unsigned char extractPermM = extractPerm
747 
 /* F2(i, j, k, l): when bit l+1 of S[i] is set, computes the weighted 3x3
    sum over lines i, j, k (vec_msum with magic), F = (sum + 8) >> 4, and
    picks p + vQP2 when that is below F, p - vQP2 when that is above F, and
    F otherwise; the chosen byte is inserted into src<j>. The permute
    vectors are advanced whether or not the bit was set. */
748 #define F2(i, j, k, l) \
749  if (S[i] & (1 << (l+1))) { \
750  const vector unsigned char a_A = vec_perm(src##i, src##j, permA1M); \
751  const vector unsigned char a_B = vec_perm(a_A, src##k, permA2M); \
752  const vector signed int a_sump = \
753  (vector signed int)vec_msum(a_B, magic, (vector unsigned int)zero);\
754  vector signed int F = vec_sr(vec_sums(a_sump, vsint32_8), vuint32_4); \
755  const vector signed int p = \
756  (vector signed int)vec_perm(src##j, (vector unsigned char)zero, \
757  extractPermM); \
758  const vector signed int sum = vec_add(p, vQP2); \
759  const vector signed int diff = vec_sub(p, vQP2); \
760  vector signed int newpm; \
761  vector unsigned char newpm2, mask; \
762  F = vec_splat(F, 3); \
763  if (vec_all_lt(sum, F)) \
764  newpm = sum; \
765  else if (vec_all_gt(diff, F)) \
766  newpm = diff; \
767  else newpm = F; \
768  newpm2 = vec_splat((vector unsigned char)newpm, 15); \
769  mask = vec_add(identity, tenRightM); \
770  src##j = vec_perm(src##j, newpm2, mask); \
771  } \
772  permA1M = vec_add(permA1M, permA1inc); \
773  permA2M = vec_add(permA2M, permA2inc); \
774  tenRightM = vec_sro(tenRightM, eightLeft); \
775  extractPermM = vec_add(extractPermM, extractPermInc)
776 
 /* ITER(i, j, k): filters the eight candidate positions of line j using
    lines i and k as the rows above and below. Because src<j> is updated in
    place, later ITER calls read the already filtered line as their upper
    neighbour. */
777 #define ITER(i, j, k) do { \
778  F_INIT(); \
779  F2(i, j, k, 0); \
780  F2(i, j, k, 1); \
781  F2(i, j, k, 2); \
782  F2(i, j, k, 3); \
783  F2(i, j, k, 4); \
784  F2(i, j, k, 5); \
785  F2(i, j, k, 6); \
786  F2(i, j, k, 7); \
787 } while (0)
788 
789  ITER(0, 1, 2);
790  ITER(1, 2, 3);
791  ITER(2, 3, 4);
792  ITER(3, 4, 5);
793  ITER(4, 5, 6);
794  ITER(5, 6, 7);
795  ITER(6, 7, 8);
796  ITER(7, 8, 9);
797 
 /* Unaligned store of line i: rotate src<i> into position with vec_lvsr,
    blend it into the two originally loaded vectors and write both back. */
798 #define STORE_LINE(i) do { \
799  const vector unsigned char permST = \
800  vec_lvsr(i * stride, srcCopy); \
801  const vector unsigned char maskST = \
802  vec_perm((vector unsigned char)zero, \
803  (vector unsigned char)neg1, permST); \
804  src##i = vec_perm(src##i ,src##i, permST); \
805  sA##i= vec_sel(sA##i, src##i, maskST); \
806  sB##i= vec_sel(src##i, sB##i, maskST); \
807  vec_st(sA##i, i * stride, srcCopy); \
808  vec_st(sB##i, i * stride + 16, srcCopy); \
809 } while (0)
810 
811  STORE_LINE(1);
812  STORE_LINE(2);
813  STORE_LINE(3);
814  STORE_LINE(4);
815  STORE_LINE(5);
816  STORE_LINE(6);
817  STORE_LINE(7);
818  STORE_LINE(8);
819 
 /* NOTE(review): F_INIT is defined above but has no matching #undef here,
    so it stays defined for the rest of the translation unit. */
820 #undef STORE_LINE
821 #undef ITER
822 #undef F2
823 }
824 
/*
 * These three entry points are not implemented with AltiVec in this file:
 * each _altivec name forwards all of its arguments to the corresponding _C
 * function, which must be defined elsewhere. "a..." is the GNU named
 * variadic macro syntax, not the standard __VA_ARGS__ form.
 */
825 #define doHorizLowPass_altivec(a...) doHorizLowPass_C(a)
826 #define doHorizDefFilter_altivec(a...) doHorizDefFilter_C(a)
827 #define do_a_deblock_altivec(a...) do_a_deblock_C(a)
828 
829 static inline void tempNoiseReducer_altivec(uint8_t *src, int stride,
830  uint8_t *tempBlurred, uint32_t *tempBlurredPast, int *maxNoise)
831 {
832  const vector signed char neg1 = vec_splat_s8(-1);
833  const vector unsigned char permHH = (const vector unsigned char){0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
834  0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F};
835 
836  const vector signed int zero = vec_splat_s32(0);
837  const vector signed short vsint16_1 = vec_splat_s16(1);
838  vector signed int v_dp = zero;
839  vector signed int v_sysdp = zero;
840  int d, sysd, i;
841 
842 #define LOAD_LINE(src, i) \
843  register int j##src##i = i * stride; \
844  vector unsigned char perm##src##i = vec_lvsl(j##src##i, src); \
845  const vector unsigned char v_##src##A1##i = vec_ld(j##src##i, src); \
846  const vector unsigned char v_##src##A2##i = vec_ld(j##src##i + 16, src); \
847  const vector unsigned char v_##src##A##i = \
848  vec_perm(v_##src##A1##i, v_##src##A2##i, perm##src##i); \
849  vector signed short v_##src##Ass##i = \
850  (vector signed short)vec_mergeh((vector signed char)zero, \
851  (vector signed char)v_##src##A##i)
852 
853  LOAD_LINE(src, 0);
854  LOAD_LINE(src, 1);
855  LOAD_LINE(src, 2);
856  LOAD_LINE(src, 3);
857  LOAD_LINE(src, 4);
858  LOAD_LINE(src, 5);
859  LOAD_LINE(src, 6);
860  LOAD_LINE(src, 7);
861 
862  LOAD_LINE(tempBlurred, 0);
863  LOAD_LINE(tempBlurred, 1);
864  LOAD_LINE(tempBlurred, 2);
865  LOAD_LINE(tempBlurred, 3);
866  LOAD_LINE(tempBlurred, 4);
867  LOAD_LINE(tempBlurred, 5);
868  LOAD_LINE(tempBlurred, 6);
869  LOAD_LINE(tempBlurred, 7);
870 #undef LOAD_LINE
871 
872 #define ACCUMULATE_DIFFS(i) do { \
873  vector signed short v_d = vec_sub(v_tempBlurredAss##i, \
874  v_srcAss##i); \
875  v_dp = vec_msums(v_d, v_d, v_dp); \
876  v_sysdp = vec_msums(v_d, vsint16_1, v_sysdp); \
877  } while (0)
878 
879  ACCUMULATE_DIFFS(0);
880  ACCUMULATE_DIFFS(1);
881  ACCUMULATE_DIFFS(2);
882  ACCUMULATE_DIFFS(3);
883  ACCUMULATE_DIFFS(4);
884  ACCUMULATE_DIFFS(5);
885  ACCUMULATE_DIFFS(6);
886  ACCUMULATE_DIFFS(7);
887 #undef ACCUMULATE_DIFFS
888 
889  tempBlurredPast[127]= maxNoise[0];
890  tempBlurredPast[128]= maxNoise[1];
891  tempBlurredPast[129]= maxNoise[2];
892 
893  v_dp = vec_sums(v_dp, zero);
894  v_sysdp = vec_sums(v_sysdp, zero);
895 
896  v_dp = vec_splat(v_dp, 3);
897  v_sysdp = vec_splat(v_sysdp, 3);
898 
899  vec_ste(v_dp, 0, &d);
900  vec_ste(v_sysdp, 0, &sysd);
901 
902  i = d;
903  d = (4*d
904  +(*(tempBlurredPast-256))
905  +(*(tempBlurredPast-1))+ (*(tempBlurredPast+1))
906  +(*(tempBlurredPast+256))
907  +4)>>3;
908 
909  *tempBlurredPast=i;
910 
911  if (d > maxNoise[1]) {
912  if (d < maxNoise[2]) {
913 #define OP(i) v_tempBlurredAss##i = vec_avg(v_tempBlurredAss##i, v_srcAss##i);
914 
915  OP(0);
916  OP(1);
917  OP(2);
918  OP(3);
919  OP(4);
920  OP(5);
921  OP(6);
922  OP(7);
923 #undef OP
924  } else {
925 #define OP(i) v_tempBlurredAss##i = v_srcAss##i;
926 
927  OP(0);
928  OP(1);
929  OP(2);
930  OP(3);
931  OP(4);
932  OP(5);
933  OP(6);
934  OP(7);
935 #undef OP
936  }
937  } else {
938  if (d < maxNoise[0]) {
939  const vector signed short vsint16_7 = vec_splat_s16(7);
940  const vector signed short vsint16_4 = vec_splat_s16(4);
941  const vector unsigned short vuint16_3 = vec_splat_u16(3);
942 
943 #define OP(i) do { \
944  const vector signed short v_temp = \
945  vec_mladd(v_tempBlurredAss##i, vsint16_7, v_srcAss##i); \
946  const vector signed short v_temp2 = vec_add(v_temp, vsint16_4); \
947  v_tempBlurredAss##i = vec_sr(v_temp2, vuint16_3); \
948  } while (0)
949 
950  OP(0);
951  OP(1);
952  OP(2);
953  OP(3);
954  OP(4);
955  OP(5);
956  OP(6);
957  OP(7);
958 #undef OP
959  } else {
960  const vector signed short vsint16_3 = vec_splat_s16(3);
961  const vector signed short vsint16_2 = vec_splat_s16(2);
962 
963 #define OP(i) do { \
964  const vector signed short v_temp = \
965  vec_mladd(v_tempBlurredAss##i, vsint16_3, v_srcAss##i); \
966  const vector signed short v_temp2 = vec_add(v_temp, vsint16_2); \
967  v_tempBlurredAss##i = \
968  vec_sr(v_temp2, (vector unsigned short)vsint16_2); \
969  } while (0)
970 
971  OP(0);
972  OP(1);
973  OP(2);
974  OP(3);
975  OP(4);
976  OP(5);
977  OP(6);
978  OP(7);
979 #undef OP
980  }
981  }
982 
983 #define PACK_AND_STORE(src, i) do { \
984  const vector unsigned char perms = vec_lvsr(i * stride, src); \
985  const vector unsigned char vf = \
986  vec_packsu(v_tempBlurredAss##1, (vector signed short)zero); \
987  const vector unsigned char vg = vec_perm(vf, v_##src##A##i, permHH); \
988  const vector unsigned char mask = \
989  vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms); \
990  const vector unsigned char vg2 = vec_perm(vg, vg, perms); \
991  const vector unsigned char svA = vec_sel(v_##src##A1##i, vg2, mask); \
992  const vector unsigned char svB = vec_sel(vg2, v_##src##A2##i, mask); \
993  vec_st(svA, i * stride, src); \
994  vec_st(svB, i * stride + 16, src); \
995 } while (0)
996 
997  PACK_AND_STORE(src, 0);
998  PACK_AND_STORE(src, 1);
999  PACK_AND_STORE(src, 2);
1000  PACK_AND_STORE(src, 3);
1001  PACK_AND_STORE(src, 4);
1002  PACK_AND_STORE(src, 5);
1003  PACK_AND_STORE(src, 6);
1004  PACK_AND_STORE(src, 7);
1005  PACK_AND_STORE(tempBlurred, 0);
1006  PACK_AND_STORE(tempBlurred, 1);
1007  PACK_AND_STORE(tempBlurred, 2);
1008  PACK_AND_STORE(tempBlurred, 3);
1009  PACK_AND_STORE(tempBlurred, 4);
1010  PACK_AND_STORE(tempBlurred, 5);
1011  PACK_AND_STORE(tempBlurred, 6);
1012  PACK_AND_STORE(tempBlurred, 7);
1013 #undef PACK_AND_STORE
1014 }
1015 
1016 static inline void transpose_16x8_char_toPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) {
1017  const vector unsigned char zero = vec_splat_u8(0);
1018 
1019 #define LOAD_DOUBLE_LINE(i, j) \
1020  vector unsigned char perm1##i = vec_lvsl(i * stride, src); \
1021  vector unsigned char perm2##i = vec_lvsl(j * stride, src); \
1022  vector unsigned char srcA##i = vec_ld(i * stride, src); \
1023  vector unsigned char srcB##i = vec_ld(i * stride + 16, src); \
1024  vector unsigned char srcC##i = vec_ld(j * stride, src); \
1025  vector unsigned char srcD##i = vec_ld(j * stride+ 16, src); \
1026  vector unsigned char src##i = vec_perm(srcA##i, srcB##i, perm1##i); \
1027  vector unsigned char src##j = vec_perm(srcC##i, srcD##i, perm2##i)
1028 
1029  LOAD_DOUBLE_LINE(0, 1);
1030  LOAD_DOUBLE_LINE(2, 3);
1031  LOAD_DOUBLE_LINE(4, 5);
1032  LOAD_DOUBLE_LINE(6, 7);
1033 #undef LOAD_DOUBLE_LINE
1034 
1035  vector unsigned char tempA = vec_mergeh(src0, zero);
1036  vector unsigned char tempB = vec_mergel(src0, zero);
1037  vector unsigned char tempC = vec_mergeh(src1, zero);
1038  vector unsigned char tempD = vec_mergel(src1, zero);
1039  vector unsigned char tempE = vec_mergeh(src2, zero);
1040  vector unsigned char tempF = vec_mergel(src2, zero);
1041  vector unsigned char tempG = vec_mergeh(src3, zero);
1042  vector unsigned char tempH = vec_mergel(src3, zero);
1043  vector unsigned char tempI = vec_mergeh(src4, zero);
1044  vector unsigned char tempJ = vec_mergel(src4, zero);
1045  vector unsigned char tempK = vec_mergeh(src5, zero);
1046  vector unsigned char tempL = vec_mergel(src5, zero);
1047  vector unsigned char tempM = vec_mergeh(src6, zero);
1048  vector unsigned char tempN = vec_mergel(src6, zero);
1049  vector unsigned char tempO = vec_mergeh(src7, zero);
1050  vector unsigned char tempP = vec_mergel(src7, zero);
1051 
1052  vector unsigned char temp0 = vec_mergeh(tempA, tempI);
1053  vector unsigned char temp1 = vec_mergel(tempA, tempI);
1054  vector unsigned char temp2 = vec_mergeh(tempB, tempJ);
1055  vector unsigned char temp3 = vec_mergel(tempB, tempJ);
1056  vector unsigned char temp4 = vec_mergeh(tempC, tempK);
1057  vector unsigned char temp5 = vec_mergel(tempC, tempK);
1058  vector unsigned char temp6 = vec_mergeh(tempD, tempL);
1059  vector unsigned char temp7 = vec_mergel(tempD, tempL);
1060  vector unsigned char temp8 = vec_mergeh(tempE, tempM);
1061  vector unsigned char temp9 = vec_mergel(tempE, tempM);
1062  vector unsigned char temp10 = vec_mergeh(tempF, tempN);
1063  vector unsigned char temp11 = vec_mergel(tempF, tempN);
1064  vector unsigned char temp12 = vec_mergeh(tempG, tempO);
1065  vector unsigned char temp13 = vec_mergel(tempG, tempO);
1066  vector unsigned char temp14 = vec_mergeh(tempH, tempP);
1067  vector unsigned char temp15 = vec_mergel(tempH, tempP);
1068 
1069  tempA = vec_mergeh(temp0, temp8);
1070  tempB = vec_mergel(temp0, temp8);
1071  tempC = vec_mergeh(temp1, temp9);
1072  tempD = vec_mergel(temp1, temp9);
1073  tempE = vec_mergeh(temp2, temp10);
1074  tempF = vec_mergel(temp2, temp10);
1075  tempG = vec_mergeh(temp3, temp11);
1076  tempH = vec_mergel(temp3, temp11);
1077  tempI = vec_mergeh(temp4, temp12);
1078  tempJ = vec_mergel(temp4, temp12);
1079  tempK = vec_mergeh(temp5, temp13);
1080  tempL = vec_mergel(temp5, temp13);
1081  tempM = vec_mergeh(temp6, temp14);
1082  tempN = vec_mergel(temp6, temp14);
1083  tempO = vec_mergeh(temp7, temp15);
1084  tempP = vec_mergel(temp7, temp15);
1085 
1086  temp0 = vec_mergeh(tempA, tempI);
1087  temp1 = vec_mergel(tempA, tempI);
1088  temp2 = vec_mergeh(tempB, tempJ);
1089  temp3 = vec_mergel(tempB, tempJ);
1090  temp4 = vec_mergeh(tempC, tempK);
1091  temp5 = vec_mergel(tempC, tempK);
1092  temp6 = vec_mergeh(tempD, tempL);
1093  temp7 = vec_mergel(tempD, tempL);
1094  temp8 = vec_mergeh(tempE, tempM);
1095  temp9 = vec_mergel(tempE, tempM);
1096  temp10 = vec_mergeh(tempF, tempN);
1097  temp11 = vec_mergel(tempF, tempN);
1098  temp12 = vec_mergeh(tempG, tempO);
1099  temp13 = vec_mergel(tempG, tempO);
1100  temp14 = vec_mergeh(tempH, tempP);
1101  temp15 = vec_mergel(tempH, tempP);
1102 
1103  vec_st(temp0, 0, dst);
1104  vec_st(temp1, 16, dst);
1105  vec_st(temp2, 32, dst);
1106  vec_st(temp3, 48, dst);
1107  vec_st(temp4, 64, dst);
1108  vec_st(temp5, 80, dst);
1109  vec_st(temp6, 96, dst);
1110  vec_st(temp7, 112, dst);
1111  vec_st(temp8, 128, dst);
1112  vec_st(temp9, 144, dst);
1113  vec_st(temp10, 160, dst);
1114  vec_st(temp11, 176, dst);
1115  vec_st(temp12, 192, dst);
1116  vec_st(temp13, 208, dst);
1117  vec_st(temp14, 224, dst);
1118  vec_st(temp15, 240, dst);
1119 }
1120 
1121 static inline void transpose_8x16_char_fromPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) {
1122  const vector unsigned char zero = vec_splat_u8(0);
1123  const vector signed char neg1 = vec_splat_s8(-1);
1124 
1125 #define LOAD_DOUBLE_LINE(i, j) \
1126  vector unsigned char src##i = vec_ld(i * 16, src); \
1127  vector unsigned char src##j = vec_ld(j * 16, src)
1128 
1129  LOAD_DOUBLE_LINE(0, 1);
1130  LOAD_DOUBLE_LINE(2, 3);
1131  LOAD_DOUBLE_LINE(4, 5);
1132  LOAD_DOUBLE_LINE(6, 7);
1133  LOAD_DOUBLE_LINE(8, 9);
1134  LOAD_DOUBLE_LINE(10, 11);
1135  LOAD_DOUBLE_LINE(12, 13);
1136  LOAD_DOUBLE_LINE(14, 15);
1137 #undef LOAD_DOUBLE_LINE
1138 
1139  vector unsigned char tempA = vec_mergeh(src0, src8);
1140  vector unsigned char tempB;
1141  vector unsigned char tempC = vec_mergeh(src1, src9);
1142  vector unsigned char tempD;
1143  vector unsigned char tempE = vec_mergeh(src2, src10);
1144  vector unsigned char tempG = vec_mergeh(src3, src11);
1145  vector unsigned char tempI = vec_mergeh(src4, src12);
1146  vector unsigned char tempJ;
1147  vector unsigned char tempK = vec_mergeh(src5, src13);
1148  vector unsigned char tempL;
1149  vector unsigned char tempM = vec_mergeh(src6, src14);
1150  vector unsigned char tempO = vec_mergeh(src7, src15);
1151 
1152  vector unsigned char temp0 = vec_mergeh(tempA, tempI);
1153  vector unsigned char temp1 = vec_mergel(tempA, tempI);
1154  vector unsigned char temp2;
1155  vector unsigned char temp3;
1156  vector unsigned char temp4 = vec_mergeh(tempC, tempK);
1157  vector unsigned char temp5 = vec_mergel(tempC, tempK);
1158  vector unsigned char temp6;
1159  vector unsigned char temp7;
1160  vector unsigned char temp8 = vec_mergeh(tempE, tempM);
1161  vector unsigned char temp9 = vec_mergel(tempE, tempM);
1162  vector unsigned char temp12 = vec_mergeh(tempG, tempO);
1163  vector unsigned char temp13 = vec_mergel(tempG, tempO);
1164 
1165  tempA = vec_mergeh(temp0, temp8);
1166  tempB = vec_mergel(temp0, temp8);
1167  tempC = vec_mergeh(temp1, temp9);
1168  tempD = vec_mergel(temp1, temp9);
1169  tempI = vec_mergeh(temp4, temp12);
1170  tempJ = vec_mergel(temp4, temp12);
1171  tempK = vec_mergeh(temp5, temp13);
1172  tempL = vec_mergel(temp5, temp13);
1173 
1174  temp0 = vec_mergeh(tempA, tempI);
1175  temp1 = vec_mergel(tempA, tempI);
1176  temp2 = vec_mergeh(tempB, tempJ);
1177  temp3 = vec_mergel(tempB, tempJ);
1178  temp4 = vec_mergeh(tempC, tempK);
1179  temp5 = vec_mergel(tempC, tempK);
1180  temp6 = vec_mergeh(tempD, tempL);
1181  temp7 = vec_mergel(tempD, tempL);
1182 
1183 
1184 #define STORE_DOUBLE_LINE(i, j) do { \
1185  vector unsigned char dstAi = vec_ld(i * stride, dst); \
1186  vector unsigned char dstBi = vec_ld(i * stride + 16, dst); \
1187  vector unsigned char dstAj = vec_ld(j * stride, dst); \
1188  vector unsigned char dstBj = vec_ld(j * stride+ 16, dst); \
1189  vector unsigned char aligni = vec_lvsr(i * stride, dst); \
1190  vector unsigned char alignj = vec_lvsr(j * stride, dst); \
1191  vector unsigned char maski = \
1192  vec_perm(zero, (vector unsigned char)neg1, aligni); \
1193  vector unsigned char maskj = \
1194  vec_perm(zero, (vector unsigned char)neg1, alignj); \
1195  vector unsigned char dstRi = vec_perm(temp##i, temp##i, aligni); \
1196  vector unsigned char dstRj = vec_perm(temp##j, temp##j, alignj); \
1197  vector unsigned char dstAFi = vec_sel(dstAi, dstRi, maski); \
1198  vector unsigned char dstBFi = vec_sel(dstRi, dstBi, maski); \
1199  vector unsigned char dstAFj = vec_sel(dstAj, dstRj, maskj); \
1200  vector unsigned char dstBFj = vec_sel(dstRj, dstBj, maskj); \
1201  vec_st(dstAFi, i * stride, dst); \
1202  vec_st(dstBFi, i * stride + 16, dst); \
1203  vec_st(dstAFj, j * stride, dst); \
1204  vec_st(dstBFj, j * stride + 16, dst); \
1205 } while (0)
1206 
1207  STORE_DOUBLE_LINE(0,1);
1208  STORE_DOUBLE_LINE(2,3);
1209  STORE_DOUBLE_LINE(4,5);
1210  STORE_DOUBLE_LINE(6,7);
1211 }
COMPARE
#define COMPARE(i)
mem_internal.h
PPContext
postprocess context.
Definition: postprocess_internal.h:116
src1
const pixel * src1
Definition: h264pred_template.c:421
data
const char data[16]
Definition: mxf.c:146
STORE_LINE
#define STORE_LINE(i)
doVertLowPass_altivec
static void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c)
Definition: postprocess_altivec_template.c:214
max
#define max(a, b)
Definition: cuda_runtime.h:33
S
#define S(s, c, i)
Definition: flacdsp_template.c:46
STORE
#define STORE(i)
OP
#define OP(i)
ITER
#define ITER(i, j)
LOAD_DOUBLE_LINE
#define LOAD_DOUBLE_LINE(i, j)
mask
static const uint16_t mask[17]
Definition: lzw.c:38
ACCUMULATE_DIFFS
#define ACCUMULATE_DIFFS(i)
dering_altivec
static void dering_altivec(uint8_t src[], int stride, PPContext *c)
Definition: postprocess_altivec_template.c:533
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
LOAD_LINE
#define LOAD_LINE(i)
LOAD_LINE_ALIGNED
#define LOAD_LINE_ALIGNED(i)
transpose_16x8_char_toPackedAlign_altivec
static void transpose_16x8_char_toPackedAlign_altivec(unsigned char *dst, unsigned char *src, int stride)
Definition: postprocess_altivec_template.c:1016
DECLARE_ALIGNED
#define DECLARE_ALIGNED(n, t, v)
Definition: mem_internal.h:87
transpose_8x16_char_fromPackedAlign_altivec
static void transpose_8x16_char_fromPackedAlign_altivec(unsigned char *dst, unsigned char *src, int stride)
Definition: postprocess_altivec_template.c:1121
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:269
COMPUTE_VR
#define COMPUTE_VR(i, j, k)
src2
const pixel * src2
Definition: h264pred_template.c:422
stride
#define stride
Definition: h264pred_template.c:537
av_uninit
#define av_uninit(x)
Definition: attributes.h:154
vertClassify_altivec
static int vertClassify_altivec(uint8_t src[], int stride, PPContext *c)
Definition: postprocess_altivec_template.c:59
doVertDefFilter_altivec
static void doVertDefFilter_altivec(uint8_t src[], int stride, PPContext *c)
Definition: postprocess_altivec_template.c:412
EXTRACT
#define EXTRACT(op)
src0
const pixel *const src0
Definition: h264pred_template.c:420
zero
#define zero
Definition: regdef.h:64
avutil.h
STORE_DOUBLE_LINE
#define STORE_DOUBLE_LINE(i, j)
src
INIT_CLIP pixel * src
Definition: h264pred_template.c:418
d
d
Definition: ffmpeg_filter.c:156
PACK_AND_STORE_ALIGNED
#define PACK_AND_STORE_ALIGNED(i)
PACK_AND_STORE
#define PACK_AND_STORE(i)
tempNoiseReducer_altivec
static void tempNoiseReducer_altivec(uint8_t *src, int stride, uint8_t *tempBlurred, uint32_t *tempBlurredPast, int *maxNoise)
Definition: postprocess_altivec_template.c:829
short
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf default minimum maximum flags name is the option keep it simple and lowercase description are short
Definition: writing_filters.txt:89
min
float min
Definition: vorbis_enc_data.h:429