vp8dsp_altivec.c
/*
 * VP8 compatible video decoder
 *
 * Copyright (C) 2010 David Conrad
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem_internal.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/util_altivec.h"

#include "libavcodec/vp8dsp.h"

#include "hpeldsp_altivec.h"

#if HAVE_ALTIVEC
#define REPT4(...) { __VA_ARGS__, __VA_ARGS__, __VA_ARGS__, __VA_ARGS__ }
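// (each REPT4 row repeats its 4 taps four times, filling a 16-byte vector
// so that a single vec_msum can compute four output pixels in parallel)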

// h subpel filter uses msum to multiply+add 4 pixel taps at once
static const vec_s8 h_subpel_filters_inner[7] =
{
    REPT4( -6, 123,  12,  -1),
    REPT4(-11, 108,  36,  -8),
    REPT4( -9,  93,  50,  -6),
    REPT4(-16,  77,  77, -16),
    REPT4( -6,  50,  93,  -9),
    REPT4( -8,  36, 108, -11),
    REPT4( -1,  12, 123,  -6),
};

// for 6tap filters, these are the outer two taps
// The zeros mask off pixels 4-7 when filtering 0-3
// and vice-versa
static const vec_s8 h_subpel_filters_outer[3] =
{
    REPT4(0, 0, 2, 1),
    REPT4(0, 0, 3, 3),
    REPT4(0, 0, 1, 2),
};
59 
60 #define LOAD_H_SUBPEL_FILTER(i) \
61  vec_s8 filter_inner = h_subpel_filters_inner[i]; \
62  vec_s8 filter_outerh = h_subpel_filters_outer[(i)>>1]; \
63  vec_s8 filter_outerl = vec_sld(filter_outerh, filter_outerh, 2)
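// The two-byte rotation moves the zero pair within each 4-byte group, so
// filter_outerl masks the opposite pixels from filter_outerh (the
// "vice-versa" case noted above).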

#if HAVE_BIGENDIAN
#define GET_PIXHL(offset)                     \
    a = vec_ld((offset)-is6tap-1, src);       \
    b = vec_ld((offset)-is6tap-1+15, src);    \
    pixh = vec_perm(a, b, permh##offset);     \
    pixl = vec_perm(a, b, perml##offset)

#define GET_OUTER(offset) outer = vec_perm(a, b, perm_6tap##offset)
#else
#define GET_PIXHL(offset)                     \
    a = vec_vsx_ld((offset)-is6tap-1, src);   \
    pixh = vec_perm(a, a, perm_inner);        \
    pixl = vec_perm(a, a, vec_add(perm_inner, vec_splat_u8(4)))

#define GET_OUTER(offset) outer = vec_perm(a, a, perm_outer)
#endif
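// Big-endian (pre-VSX) cores emulate the unaligned load with two aligned
// vec_ld fetches stitched together by vec_perm; the VSX path loads
// unaligned directly with vec_vsx_ld and permutes only to reorder pixels.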

#define FILTER_H(dstv, off)                            \
    GET_PIXHL(off);                                    \
    filth = vec_msum(filter_inner, pixh, c64);         \
    filtl = vec_msum(filter_inner, pixl, c64);         \
\
    if (is6tap) {                                      \
        GET_OUTER(off);                                \
        filth = vec_msum(filter_outerh, outer, filth); \
        filtl = vec_msum(filter_outerl, outer, filtl); \
    }                                                  \
    if (w == 4)                                        \
        filtl = filth; /* discard pixels 4-7 */        \
    dstv = vec_packs(filth, filtl);                    \
    dstv = vec_sra(dstv, c7)
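// The bias c64 (1 << 6) seeds each vec_msum accumulator; after the
// saturating pack to 16 bits, the arithmetic shift right by 7 completes
// the rounded divide by the filters' 128-unit scale.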

static av_always_inline
void put_vp8_epel_h_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
                                 const uint8_t *src, ptrdiff_t src_stride,
                                 int h, int mx, int w, int is6tap)
{
    LOAD_H_SUBPEL_FILTER(mx-1);
#if HAVE_BIGENDIAN
    vec_u8 align_vec0, align_vec8, permh0, permh8;
    vec_u8 perm_6tap0, perm_6tap8, perml0, perml8;
    vec_u8 b;
#endif
    vec_u8 filt, a, pixh, pixl, outer;
    vec_s16 f16h, f16l;
    vec_s32 filth, filtl;

    vec_u8 perm_inner6 = { 1,2,3,4, 2,3,4,5, 3,4,5,6, 4,5,6,7 };
    vec_u8 perm_inner4 = { 0,1,2,3, 1,2,3,4, 2,3,4,5, 3,4,5,6 };
    vec_u8 perm_inner  = is6tap ? perm_inner6 : perm_inner4;
    vec_u8 perm_outer  = { 4,9, 0,5, 5,10, 1,6, 6,11, 2,7, 7,12, 3,8 };
    vec_s32 c64 = vec_sl(vec_splat_s32(1), vec_splat_u32(6));
    vec_u16 c7  = vec_splat_u16(7);

#if HAVE_BIGENDIAN
    align_vec0 = vec_lvsl( -is6tap-1, src);
    align_vec8 = vec_lvsl(8-is6tap-1, src);

    permh0     = vec_perm(align_vec0, align_vec0, perm_inner);
    permh8     = vec_perm(align_vec8, align_vec8, perm_inner);
    perm_inner = vec_add(perm_inner, vec_splat_u8(4));
    perml0     = vec_perm(align_vec0, align_vec0, perm_inner);
    perml8     = vec_perm(align_vec8, align_vec8, perm_inner);
    perm_6tap0 = vec_perm(align_vec0, align_vec0, perm_outer);
    perm_6tap8 = vec_perm(align_vec8, align_vec8, perm_outer);
#endif

    while (h --> 0) {
        FILTER_H(f16h, 0);

        if (w == 16) {
            FILTER_H(f16l, 8);
            filt = vec_packsu(f16h, f16l);
            vec_st(filt, 0, dst);
        } else {
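            // Narrow rows are stored 4 bytes at a time with vec_ste:
            // one element store for w == 4, two for w == 8.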
            filt = vec_packsu(f16h, f16h);
            vec_ste((vec_u32)filt, 0, (uint32_t*)dst);
            if (w == 8)
                vec_ste((vec_u32)filt, 4, (uint32_t*)dst);
        }
        src += src_stride;
        dst += dst_stride;
    }
}

// v subpel filter does a simple vertical multiply + add
static const vec_u8 v_subpel_filters[7] =
{
    { 0,  6, 123,  12,  1,  0 },
    { 2, 11, 108,  36,  8,  1 },
    { 0,  9,  93,  50,  6,  0 },
    { 3, 16,  77,  77, 16,  3 },
    { 0,  6,  50,  93,  9,  0 },
    { 1,  8,  36, 108, 11,  2 },
    { 0,  1,  12, 123,  6,  0 },
};
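// The taps are stored as unsigned magnitudes; FILTER_V below applies the
// two negative taps (f1 and f4) by subtracting their products.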

#define LOAD_V_SUBPEL_FILTER(i) \
    vec_u8 subpel_filter = v_subpel_filters[i]; \
    vec_u8 f0 = vec_splat(subpel_filter, 0); \
    vec_u8 f1 = vec_splat(subpel_filter, 1); \
    vec_u8 f2 = vec_splat(subpel_filter, 2); \
    vec_u8 f3 = vec_splat(subpel_filter, 3); \
    vec_u8 f4 = vec_splat(subpel_filter, 4); \
    vec_u8 f5 = vec_splat(subpel_filter, 5)
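// vec_splat broadcasts each tap to all 16 byte lanes so that a single
// vec_mule/vec_mulo can apply it across a whole row.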

#define FILTER_V(dstv, vec_mul)         \
    s1f = (vec_s16)vec_mul(s1, f1);     \
    s2f = (vec_s16)vec_mul(s2, f2);     \
    s3f = (vec_s16)vec_mul(s3, f3);     \
    s4f = (vec_s16)vec_mul(s4, f4);     \
    s2f = vec_subs(s2f, s1f);           \
    s3f = vec_subs(s3f, s4f);           \
    if (is6tap) {                       \
        s0f = (vec_s16)vec_mul(s0, f0); \
        s5f = (vec_s16)vec_mul(s5, f5); \
        s2f = vec_adds(s2f, s0f);       \
        s3f = vec_adds(s3f, s5f);       \
    }                                   \
    dstv = vec_adds(s2f, s3f);          \
    dstv = vec_adds(dstv, c64);         \
    dstv = vec_sra(dstv, c7)

#if HAVE_BIGENDIAN
#define LOAD_HL(off, s, perm) load_with_perm_vec(off, s, perm)
#else
#define LOAD_HL(off, s, perm) vec_mergeh(vec_vsx_ld(off,s), vec_vsx_ld(off+8,s))
#endif
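// Both variants interleave the row so pixels 0-7 sit in the even byte
// lanes and pixels 8-15 in the odd ones; vec_mule then filters one half
// and vec_mulo the other.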

static av_always_inline
void put_vp8_epel_v_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
                                 const uint8_t *src, ptrdiff_t src_stride,
                                 int h, int my, int w, int is6tap)
{
    LOAD_V_SUBPEL_FILTER(my-1);
    vec_u8 s0, s1, s2, s3, s4, s5, filt, align_vech, perm_vec, align_vecl;
    vec_s16 s0f, s1f, s2f, s3f, s4f, s5f, f16h, f16l;
    vec_s16 c64 = vec_sl(vec_splat_s16(1), vec_splat_u16(6));
    vec_u16 c7  = vec_splat_u16(7);

#if HAVE_BIGENDIAN
    // we want pixels 0-7 to be in the even positions and 8-15 in the odd,
    // so combine this permute with the alignment permute vector
    align_vech = vec_lvsl(0, src);
    align_vecl = vec_sld(align_vech, align_vech, 8);
    if (w == 16)
        perm_vec = vec_mergeh(align_vech, align_vecl);
    else
        perm_vec = vec_mergeh(align_vech, align_vech);
#endif

    if (is6tap)
        s0 = LOAD_HL(-2*src_stride, src, perm_vec);
    s1 = LOAD_HL(-1*src_stride, src, perm_vec);
    s2 = LOAD_HL( 0*src_stride, src, perm_vec);
    s3 = LOAD_HL( 1*src_stride, src, perm_vec);
    if (is6tap)
        s4 = LOAD_HL( 2*src_stride, src, perm_vec);

    src += (2+is6tap)*src_stride;

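    // The context rows sit in a sliding window (s0-s5): each iteration
    // below loads just one new row and shifts the others up afterwards.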
    while (h --> 0) {
        if (is6tap)
            s5 = LOAD_HL(0, src, perm_vec);
        else
            s4 = LOAD_HL(0, src, perm_vec);

        FILTER_V(f16h, vec_mule);

        if (w == 16) {
            FILTER_V(f16l, vec_mulo);
            filt = vec_packsu(f16h, f16l);
            vec_st(filt, 0, dst);
        } else {
            filt = vec_packsu(f16h, f16h);
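            // vec_ste stores the 32-bit element selected by the target
            // address alignment; splatting element 0 across the vector
            // makes the w == 4 store correct at any alignment.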
            if (w == 4)
                filt = (vec_u8)vec_splat((vec_u32)filt, 0);
            else
                vec_ste((vec_u32)filt, 4, (uint32_t*)dst);
            vec_ste((vec_u32)filt, 0, (uint32_t*)dst);
        }

        if (is6tap)
            s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        if (is6tap)
            s4 = s5;

        dst += dst_stride;
        src += src_stride;
    }
}

#define EPEL_FUNCS(WIDTH, TAPS) \
static av_noinline \
void put_vp8_epel ## WIDTH ## _h ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my) \
{ \
    put_vp8_epel_h_altivec_core(dst, dst_stride, src, src_stride, h, mx, WIDTH, TAPS == 6); \
} \
\
static av_noinline \
void put_vp8_epel ## WIDTH ## _v ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my) \
{ \
    put_vp8_epel_v_altivec_core(dst, dst_stride, src, src_stride, h, my, WIDTH, TAPS == 6); \
}

#define EPEL_HV(WIDTH, HTAPS, VTAPS) \
static void put_vp8_epel ## WIDTH ## _h ## HTAPS ## v ## VTAPS ## _altivec(uint8_t *dst, ptrdiff_t dstride, const uint8_t *src, ptrdiff_t sstride, int h, int mx, int my) \
{ \
    DECLARE_ALIGNED(16, uint8_t, tmp)[(2*WIDTH+5)*16]; \
    if (VTAPS == 6) { \
        put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16, src-2*sstride, sstride, h+5, mx, my); \
        put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+2*16, 16, h, mx, my); \
    } else { \
        put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16, src-sstride, sstride, h+4, mx, my); \
        put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+16, 16, h, mx, my); \
    } \
}
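// The h+v cases filter horizontally into an aligned 16-byte-stride scratch
// buffer first, over h+5 (6-tap) or h+4 (4-tap) rows starting 2 (or 1)
// rows above the block, so the vertical pass finds its context rows there.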

EPEL_FUNCS(16,6)
EPEL_FUNCS(8, 6)
EPEL_FUNCS(8, 4)
EPEL_FUNCS(4, 6)
EPEL_FUNCS(4, 4)

EPEL_HV(16, 6,6)
EPEL_HV(8,  6,6)
EPEL_HV(8,  4,6)
EPEL_HV(8,  6,4)
EPEL_HV(8,  4,4)
EPEL_HV(4,  6,6)
EPEL_HV(4,  4,6)
EPEL_HV(4,  6,4)
EPEL_HV(4,  4,4)

static void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t dstride, const uint8_t *src, ptrdiff_t sstride, int h, int mx, int my)
{
    register vector unsigned char perm;
    int i;
    register ptrdiff_t dstride2 = dstride << 1, sstride2 = sstride << 1;
    register ptrdiff_t dstride3 = dstride2 + dstride, sstride3 = sstride + sstride2;
    register ptrdiff_t dstride4 = dstride << 2, sstride4 = sstride << 2;

#if HAVE_BIGENDIAN
    perm = vec_lvsl(0, src);
#endif
// hand-unrolling the loop by 4 gains about 15%:
// minimum execution time goes from 74 to 60 cycles.
// it's faster than -funroll-loops, but combining
// -funroll-loops with this is bad - 74 cycles again.
// all this is on a 7450, tuning for the 7450
    for (i = 0; i < h; i += 4) {
        vec_st(load_with_perm_vec(0,        src, perm), 0,        dst);
        vec_st(load_with_perm_vec(sstride,  src, perm), dstride,  dst);
        vec_st(load_with_perm_vec(sstride2, src, perm), dstride2, dst);
        vec_st(load_with_perm_vec(sstride3, src, perm), dstride3, dst);
        src += sstride4;
        dst += dstride4;
    }
}

#endif /* HAVE_ALTIVEC */

av_cold void ff_vp78dsp_init_ppc(VP8DSPContext *c)
{
#if HAVE_ALTIVEC
    if (!PPC_ALTIVEC(av_get_cpu_flags()))
        return;

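    // Table layout: [i][j][k] with i = 4 - log2(width) (0: 16, 1: 8,
    // 2: 4 pixels), j = vertical filter (0: none, 1: 4-tap, 2: 6-tap),
    // and k = the same for the horizontal filter.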
    c->put_vp8_epel_pixels_tab[0][0][0] = put_vp8_pixels16_altivec;
    c->put_vp8_epel_pixels_tab[0][0][2] = put_vp8_epel16_h6_altivec;
    c->put_vp8_epel_pixels_tab[0][2][0] = put_vp8_epel16_v6_altivec;
    c->put_vp8_epel_pixels_tab[0][2][2] = put_vp8_epel16_h6v6_altivec;

    c->put_vp8_epel_pixels_tab[1][0][2] = put_vp8_epel8_h6_altivec;
    c->put_vp8_epel_pixels_tab[1][2][0] = put_vp8_epel8_v6_altivec;
    c->put_vp8_epel_pixels_tab[1][0][1] = put_vp8_epel8_h4_altivec;
    c->put_vp8_epel_pixels_tab[1][1][0] = put_vp8_epel8_v4_altivec;

    c->put_vp8_epel_pixels_tab[1][2][2] = put_vp8_epel8_h6v6_altivec;
    c->put_vp8_epel_pixels_tab[1][1][1] = put_vp8_epel8_h4v4_altivec;
    c->put_vp8_epel_pixels_tab[1][1][2] = put_vp8_epel8_h6v4_altivec;
    c->put_vp8_epel_pixels_tab[1][2][1] = put_vp8_epel8_h4v6_altivec;

    c->put_vp8_epel_pixels_tab[2][0][2] = put_vp8_epel4_h6_altivec;
    c->put_vp8_epel_pixels_tab[2][2][0] = put_vp8_epel4_v6_altivec;
    c->put_vp8_epel_pixels_tab[2][0][1] = put_vp8_epel4_h4_altivec;
    c->put_vp8_epel_pixels_tab[2][1][0] = put_vp8_epel4_v4_altivec;

    c->put_vp8_epel_pixels_tab[2][2][2] = put_vp8_epel4_h6v6_altivec;
    c->put_vp8_epel_pixels_tab[2][1][1] = put_vp8_epel4_h4v4_altivec;
    c->put_vp8_epel_pixels_tab[2][1][2] = put_vp8_epel4_h6v4_altivec;
    c->put_vp8_epel_pixels_tab[2][2][1] = put_vp8_epel4_h4v6_altivec;
#endif /* HAVE_ALTIVEC */
}