h264qpel_template.c
/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#if HAVE_UNISTD_H
#include <unistd.h>
#endif

#include "libavutil/mem.h"
/* AltiVec/VSX vector types and helpers (vec_u8, vec_s16, LOAD_ZERO,
 * VEC_MERGEH/VEC_MERGEL, load_with_perm_vec, ...) used throughout this file */
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"

#ifdef DEBUG
#include <assert.h>
/* assert that the pointer is 16-byte aligned, as vec_ld()/vec_st() require */
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)(ptr) & 0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif

#if HAVE_BIGENDIAN
#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
    vec_u8 srcR1 = vec_ld(-2, s);\
    vec_u8 srcR2 = vec_ld(14, s);\
    switch (ali) {\
    default: {\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = vec_perm(srcR1, srcR2, pp2);\
        srcP3 = vec_perm(srcR1, srcR2, pp3);\
    } break;\
    case 11: {\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = vec_perm(srcR1, srcR2, pp2);\
        srcP3 = srcR2;\
    } break;\
    case 12: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = srcR2;\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 13: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = srcR2;\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 14: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = srcR2;\
        srcP1 = vec_perm(srcR2, srcR3, pp1);\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 15: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = srcR2;\
        srcP0 = vec_perm(srcR2, srcR3, pp0);\
        srcP1 = vec_perm(srcR2, srcR3, pp1);\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    }\
}
#else
#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
    srcM2 = vec_vsx_ld(-2, s);\
    srcM1 = vec_vsx_ld(-1, s);\
    srcP0 = vec_vsx_ld(0, s);\
    srcP1 = vec_vsx_ld(1, s);\
    srcP2 = vec_vsx_ld(2, s);\
    srcP3 = vec_vsx_ld(3, s);\
}
#endif /* HAVE_BIGENDIAN */
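
/*
 * For reference: on big-endian AltiVec, vec_ld() ignores the low four address
 * bits, so load_alignment() reassembles the six unaligned 16-byte windows
 * src-2 .. src+3 from two aligned loads (three for the largest misalignments)
 * combined with vec_perm(). The hypothetical scalar model below is not part
 * of the original file, illustrative only; it shows what the six vectors hold.
 */
#if 0
static void load_alignment_reference(const uint8_t *src, uint8_t win[6][16])
{
    /* win[k] mirrors srcM2, srcM1, srcP0, srcP1, srcP2, srcP3:
     * 16 consecutive bytes starting at src - 2 + k, for any alignment of src */
    for (int k = 0; k < 6; k++)
        for (int i = 0; i < 16; i++)
            win[k][i] = src[(k - 2) + i];
}
#endif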

/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, fsum;

#if HAVE_BIGENDIAN
    permM2 = vec_lvsl(-2, src);
    permM1 = vec_lvsl(-1, src);
    permP0 = vec_lvsl(+0, src);
    permP1 = vec_lvsl(+1, src);
    permP2 = vec_lvsl(+2, src);
    permP3 = vec_lvsl(+3, src);
#endif /* HAVE_BIGENDIAN */

    for (i = 0 ; i < 16 ; i ++) {
        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);

        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);

        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif
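
/*
 * For reference: the loop above vectorizes the H.264 horizontal 6-tap
 * half-pel filter (1, -5, 20, 20, -5, 1) with rounding:
 * out = clip((20*(p0+p1) - 5*(m1+p2) + (m2+p3) + 16) >> 5).
 * A hypothetical plain-C sketch of one 16x16 block, not part of the original
 * file and illustrative only (it ignores the OP_U8_ALTIVEC put/avg choice):
 */
#if 0
static void h264_qpel16_h_lowpass_ref(uint8_t *dst, const uint8_t *src,
                                      int dstStride, int srcStride)
{
    for (int y = 0; y < 16; y++) {
        for (int x = 0; x < 16; x++) {
            int v = 20 * (src[x]     + src[x + 1])
                  -  5 * (src[x - 1] + src[x + 2])
                  +      (src[x - 2] + src[x + 3]);
            v = (v + 16) >> 5;                      /* round and scale */
            dst[x] = v < 0 ? 0 : v > 255 ? 255 : v; /* clip to 8 bits  */
        }
        src += srcStride;
        dst += dstStride;
    }
}
#endif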

/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    vec_u8 perm;
#if HAVE_BIGENDIAN
    perm = vec_lvsl(0, src);
#endif
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4));

    const uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcM1 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP0 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP1 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP2 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, fsum, srcP3;

    for (i = 0 ; i < 16 ; i++) {
        srcP3 = load_with_perm_vec(0, srcbis, perm);
        srcbis += srcStride;

        srcP3ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif
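
/*
 * For reference: the same 6-tap filter as above, applied down a column. The
 * srcM2ss../srcP2ss.. assignments in the loop rotate a six-row sliding window
 * so each source row is loaded only once. A hypothetical plain-C sketch, not
 * part of the original file, illustrative only:
 */
#if 0
static void h264_qpel16_v_lowpass_ref(uint8_t *dst, const uint8_t *src,
                                      int dstStride, int srcStride)
{
    for (int y = 0; y < 16; y++) {
        for (int x = 0; x < 16; x++) {
            int v = 20 * (src[x]                  + src[x +     srcStride])
                  -  5 * (src[x -     srcStride]  + src[x + 2 * srcStride])
                  +      (src[x - 2 * srcStride]  + src[x + 3 * srcStride]);
            v = (v + 16) >> 5;
            dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
        src += srcStride;
        dst += dstStride;
    }
}
#endif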

/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
                                                  const uint8_t *src,
                                                  int dstStride, int tmpStride,
                                                  int srcStride)
{
    register int i;
    LOAD_ZERO;
    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1), vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1), vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum;
    vec_s16 ssume, ssumo;

#if HAVE_BIGENDIAN
    permM2 = vec_lvsl(-2, src);
    permM1 = vec_lvsl(-1, src);
    permP0 = vec_lvsl(+0, src);
    permP1 = vec_lvsl(+1, src);
    permP2 = vec_lvsl(+2, src);
    permP3 = vec_lvsl(+3, src);
#endif /* HAVE_BIGENDIAN */

    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);

        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);

        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Bo = vec_mulo(sum3B, v1ss);
#if !HAVE_BIGENDIAN
        sum3A = (vec_s16) vec_perm(sum3A, sum3A, vcswapi2s(0, 1, 2, 3));
        sum3B = (vec_s16) vec_perm(sum3B, sum3B, vcswapi2s(0, 1, 2, 3));
#endif
        pp3Ae = vec_sra((vec_s32) sum3A, v16ui);
        pp3Be = vec_sra((vec_s32) sum3B, v16ui);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif
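
/*
 * For reference: the two passes above compute the H.264 half-pel "j" sample:
 * a horizontal 6-tap pass kept as unrounded, unshifted 16-bit values in tmp[]
 * (21 rows: -2 .. 18 relative to the block), then a vertical 6-tap pass over
 * tmp with (v + 512) >> 10 rounding and a final clip to 8 bits. A hypothetical
 * plain-C sketch, not part of the original file, illustrative only (it ignores
 * the OP_U8_ALTIVEC put/avg distinction):
 */
#if 0
static void h264_qpel16_hv_lowpass_ref(uint8_t *dst, int16_t *tmp,
                                       const uint8_t *src,
                                       int dstStride, int tmpStride,
                                       int srcStride)
{
    /* first pass: horizontal filter, intermediate 16-bit results */
    src -= 2 * srcStride;
    for (int y = 0; y < 21; y++) {
        for (int x = 0; x < 16; x++)
            tmp[y * tmpStride + x] = 20 * (src[x]     + src[x + 1])
                                   -  5 * (src[x - 1] + src[x + 2])
                                   +      (src[x - 2] + src[x + 3]);
        src += srcStride;
    }

    /* second pass: vertical filter over tmp, round by 512, shift by 10, clip */
    for (int y = 0; y < 16; y++) {
        for (int x = 0; x < 16; x++) {
            const int16_t *t = tmp + (y + 2) * tmpStride + x;
            int v = 20 * (t[0]              + t[tmpStride])
                  -  5 * (t[-tmpStride]     + t[2 * tmpStride])
                  +      (t[-2 * tmpStride] + t[3 * tmpStride]);
            v = (v + 512) >> 10;
            dst[y * dstStride + x] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
    }
}
#endif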