FFmpeg
vp8dsp_altivec.c
Go to the documentation of this file.
1 /*
2  * VP8 compatible video decoder
3  *
4  * Copyright (C) 2010 David Conrad
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 #include "config.h"
24 
25 #include "libavutil/cpu.h"
26 #include "libavutil/mem_internal.h"
27 #include "libavutil/ppc/cpu.h"
29 
30 #include "libavcodec/vp8dsp.h"
31 
32 #include "hpeldsp_altivec.h"
33 
34 #if HAVE_ALTIVEC
/* Expand to a brace initializer that repeats the argument list four times.
 * With four arguments, as in the tap tables in this file, that yields
 * 16 elements. */
35 #define REPT4(...) { __VA_ARGS__, __VA_ARGS__, __VA_ARGS__, __VA_ARGS__ }
36 
/*
 * Inner four taps of the horizontal subpel filters, signed, with each
 * 4-tap group replicated four times across the vector by REPT4.
 * LOAD_H_SUBPEL_FILTER selects an entry by index; the horizontal core
 * passes mx-1. Entries 0, 2, 4 and 6 sum to 128 on their own; entries
 * 1, 3 and 5 sum to 128 only together with the matching outer taps in
 * h_subpel_filters_outer, which fits the +64 bias and >>7 in FILTER_H.
 */
37 // h subpel filter uses msum to multiply+add 4 pixel taps at once
38 static const vec_s8 h_subpel_filters_inner[7] =
39 {
40  REPT4( -6, 123, 12, -1),
41  REPT4(-11, 108, 36, -8),
42  REPT4( -9, 93, 50, -6),
43  REPT4(-16, 77, 77, -16),
44  REPT4( -6, 50, 93, -9),
45  REPT4( -8, 36, 108, -11),
46  REPT4( -1, 12, 123, -6),
47 };
48 
/*
 * Outer tap pairs for the horizontal 6-tap filters. LOAD_H_SUBPEL_FILTER
 * indexes this table with (i)>>1, so inner-filter indices 1, 3 and 5 map
 * to entries 0, 1 and 2. Each 4-byte group holds two zeros followed by
 * the two outer taps.
 */
49 // for 6tap filters, these are the outer two taps
50 // The zeros mask off pixels 4-7 when filtering 0-3
51 // and vice-versa
52 static const vec_s8 h_subpel_filters_outer[3] =
53 {
54  REPT4(0, 0, 2, 1),
55  REPT4(0, 0, 3, 3),
56  REPT4(0, 0, 1, 2),
57 };
58 
/*
 * Declare the three filter vectors used by FILTER_H in the enclosing scope:
 *   filter_inner  - inner taps, h_subpel_filters_inner[i]
 *   filter_outerh - outer taps, h_subpel_filters_outer[i >> 1]
 *   filter_outerl - filter_outerh shifted left by two bytes via vec_sld on
 *                   itself, which moves the nonzero taps into the first two
 *                   bytes of each 4-byte group
 * The expansion ends without a semicolon; the call site supplies it.
 */
59 #define LOAD_H_SUBPEL_FILTER(i) \
60  vec_s8 filter_inner = h_subpel_filters_inner[i]; \
61  vec_s8 filter_outerh = h_subpel_filters_outer[(i)>>1]; \
62  vec_s8 filter_outerl = vec_sld(filter_outerh, filter_outerh, 2)
63 
/*
 * Pixel-gathering helpers for FILTER_H. Both read source bytes starting at
 * src + offset - is6tap - 1, i.e. one byte (4-tap) or two bytes (6-tap)
 * before the first output pixel, and rely on locals of the enclosing
 * function (src, is6tap, a, pixh, pixl, outer and the permute vectors).
 *
 * GET_PIXHL fills pixh and pixl with the inner-tap inputs for the two
 * halves of an 8-pixel group; GET_OUTER fills outer with the outer-tap
 * inputs.
 *
 * Big-endian: two vec_ld loads 15 bytes apart are combined by vec_perm
 * using per-offset permute vectors. The names are formed by token pasting
 * (permh##offset etc.), so offset must be a literal for which the caller
 * has declared permhN / permlN / perm_6tapN.
 *
 * Little-endian: a single vec_vsx_ld is permuted with perm_inner,
 * perm_inner + 4 and perm_outer; the offset argument of GET_OUTER is unused.
 */
64 #if HAVE_BIGENDIAN
65 #define GET_PIXHL(offset) \
66  a = vec_ld((offset)-is6tap-1, src); \
67  b = vec_ld((offset)-is6tap-1+15, src); \
68  pixh = vec_perm(a, b, permh##offset); \
69  pixl = vec_perm(a, b, perml##offset)
70 
71 #define GET_OUTER(offset) outer = vec_perm(a, b, perm_6tap##offset)
72 #else
73 #define GET_PIXHL(offset) \
74  a = vec_vsx_ld((offset)-is6tap-1, src); \
75  pixh = vec_perm(a, a, perm_inner); \
76  pixl = vec_perm(a, a, vec_add(perm_inner, vec_splat_u8(4)))
77 
78 #define GET_OUTER(offset) outer = vec_perm(a, a, perm_outer)
79 #endif
80 
/*
 * Filter one group of up to 8 horizontally adjacent pixels starting at
 * byte offset `off` and leave the 16-bit results in dstv.
 *
 * Steps: gather inputs (GET_PIXHL), multiply-sum the inner taps onto the
 * rounding constant c64, add the outer taps when is6tap is set, pack the
 * two 32-bit accumulators to 16 bits with vec_packs, then shift right
 * arithmetically by c7.
 *
 * Uses locals of the enclosing function: filter_inner, filter_outerh,
 * filter_outerl, pixh, pixl, outer, filth, filtl, c64, c7, w and is6tap.
 * `off` is forwarded to GET_PIXHL / GET_OUTER, so on big-endian it must be
 * a literal they can token-paste. The expansion is several statements and
 * is not wrapped in do { } while (0).
 */
81 #define FILTER_H(dstv, off) \
82  GET_PIXHL(off); \
83  filth = vec_msum(filter_inner, pixh, c64); \
84  filtl = vec_msum(filter_inner, pixl, c64); \
85 \
86  if (is6tap) { \
87  GET_OUTER(off); \
88  filth = vec_msum(filter_outerh, outer, filth); \
89  filtl = vec_msum(filter_outerl, outer, filtl); \
90  } \
91  if (w == 4) \
92  filtl = filth; /* discard pixels 4-7 */ \
93  dstv = vec_packs(filth, filtl); \
94  dstv = vec_sra(dstv, c7)
95 
/**
 * Horizontal subpel interpolation of a w-wide, h-high block.
 *
 * @param dst        destination; advanced by dst_stride after each row
 * @param dst_stride byte step between destination rows
 * @param src        source; each row is read starting 1 byte (4-tap) or
 *                   2 bytes (6-tap) before src
 * @param src_stride byte step between source rows
 * @param h          number of rows to produce
 * @param mx         filter selector; mx-1 indexes the 7-entry tap table,
 *                   so presumably 1..7 -- TODO confirm with callers
 * @param w          block width; the code distinguishes 16, 8 and 4
 * @param is6tap     nonzero to include the two outer taps; it is also used
 *                   arithmetically in load offsets, so callers pass 0 or 1
 */
96 static av_always_inline
97 void put_vp8_epel_h_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
98  uint8_t *src, ptrdiff_t src_stride,
99  int h, int mx, int w, int is6tap)
100 {
101  LOAD_H_SUBPEL_FILTER(mx-1);
102 #if HAVE_BIGENDIAN
103  vec_u8 align_vec0, align_vec8, permh0, permh8;
104  vec_u8 perm_6tap0, perm_6tap8, perml0, perml8;
105  vec_u8 b;
106 #endif
107  vec_u8 filt, a, pixh, pixl, outer;
108  vec_s16 f16h, f16l;
109  vec_s32 filth, filtl;
110 
 // Byte indices, relative to the first loaded byte, of the four inner-tap
 // inputs for output pixels 0-3. The 6-tap variant is shifted by one because
 // its load starts one byte earlier.
111  vec_u8 perm_inner6 = { 1,2,3,4, 2,3,4,5, 3,4,5,6, 4,5,6,7 };
112  vec_u8 perm_inner4 = { 0,1,2,3, 1,2,3,4, 2,3,4,5, 3,4,5,6 };
113  vec_u8 perm_inner = is6tap ? perm_inner6 : perm_inner4;
 // Outer-tap inputs: in each 4-byte group the first pair belongs to an
 // output pixel in 4-7 and the second pair to the matching pixel in 0-3,
 // which lines up with the zero/nonzero layout of filter_outerl/filter_outerh.
114  vec_u8 perm_outer = { 4,9, 0,5, 5,10, 1,6, 6,11, 2,7, 7,12, 3,8 };
 // Rounding bias 64 (1 << 6) and final shift amount 7.
115  vec_s32 c64 = vec_sl(vec_splat_s32(1), vec_splat_u32(6));
116  vec_u16 c7 = vec_splat_u16(7);
117 
118 #if HAVE_BIGENDIAN
 // Fold the alignment permute from vec_lvsl into the tap permutes once, for
 // byte offsets 0 and 8; GET_PIXHL / GET_OUTER then need one vec_perm each.
 // The vectors are computed before the loop and never refreshed, so this
 // presumably relies on src keeping the same alignment on every row --
 // TODO confirm src_stride is a multiple of 16 for callers.
119  align_vec0 = vec_lvsl( -is6tap-1, src);
120  align_vec8 = vec_lvsl(8-is6tap-1, src);
121 
122  permh0 = vec_perm(align_vec0, align_vec0, perm_inner);
123  permh8 = vec_perm(align_vec8, align_vec8, perm_inner);
124  perm_inner = vec_add(perm_inner, vec_splat_u8(4));
125  perml0 = vec_perm(align_vec0, align_vec0, perm_inner);
126  perml8 = vec_perm(align_vec8, align_vec8, perm_inner);
127  perm_6tap0 = vec_perm(align_vec0, align_vec0, perm_outer);
128  perm_6tap8 = vec_perm(align_vec8, align_vec8, perm_outer);
129 #endif
130 
131  while (h --> 0) {
 // Pixels 0-7 of the row (only 0-3 are meaningful when w == 4).
132  FILTER_H(f16h, 0);
133 
134  if (w == 16) {
 // Pixels 8-15, then one full 16-byte store with unsigned saturation.
 // NOTE(review): vec_st is an aligned store, so dst is presumably
 // 16-byte aligned on this path -- confirm with callers.
135  FILTER_H(f16l, 8);
136  filt = vec_packsu(f16h, f16l);
137  vec_st(filt, 0, dst);
138  } else {
 // Narrow blocks: store one 32-bit element at dst, and a second one
 // at dst + 4 when w == 8.
139  filt = vec_packsu(f16h, f16h);
140  vec_ste((vec_u32)filt, 0, (uint32_t*)dst);
141  if (w == 8)
142  vec_ste((vec_u32)filt, 4, (uint32_t*)dst);
143  }
144  src += src_stride;
145  dst += dst_stride;
146  }
147 }
148 
/*
 * Vertical subpel filter taps, stored as unsigned magnitudes; only the
 * first six bytes of each vector are given, the rest are zero-initialized.
 * The signs are applied in FILTER_V, which subtracts the tap-1 and tap-4
 * products and adds the others. LOAD_V_SUBPEL_FILTER selects an entry by
 * index; the vertical core passes my-1.
 */
149 // v subpel filter does a simple vertical multiply + add
150 static const vec_u8 v_subpel_filters[7] =
151 {
152  { 0, 6, 123, 12, 1, 0 },
153  { 2, 11, 108, 36, 8, 1 },
154  { 0, 9, 93, 50, 6, 0 },
155  { 3, 16, 77, 77, 16, 3 },
156  { 0, 6, 50, 93, 9, 0 },
157  { 1, 8, 36, 108, 11, 2 },
158  { 0, 1, 12, 123, 6, 0 },
159 };
160 
/*
 * Declare subpel_filter = v_subpel_filters[i] and six vectors f0..f5 in the
 * enclosing scope, each holding one tap (elements 0..5 of subpel_filter)
 * splatted across all lanes. The expansion ends without a semicolon; the
 * call site supplies it.
 */
161 #define LOAD_V_SUBPEL_FILTER(i) \
162  vec_u8 subpel_filter = v_subpel_filters[i]; \
163  vec_u8 f0 = vec_splat(subpel_filter, 0); \
164  vec_u8 f1 = vec_splat(subpel_filter, 1); \
165  vec_u8 f2 = vec_splat(subpel_filter, 2); \
166  vec_u8 f3 = vec_splat(subpel_filter, 3); \
167  vec_u8 f4 = vec_splat(subpel_filter, 4); \
168  vec_u8 f5 = vec_splat(subpel_filter, 5)
169 
/*
 * Apply the vertical filter to the row vectors s0..s5 and leave eight
 * 16-bit results in dstv:
 *
 *   dstv = ((s2*f2 - s1*f1) + (s3*f3 - s4*f4)
 *           [+ s0*f0 + s5*f5 when is6tap] + c64) >> c7
 *
 * All additions and subtractions are the saturating vec_adds / vec_subs.
 * vec_mul names the widening multiply to use; the vertical core passes
 * vec_mule for one half of the lanes and vec_mulo for the other.
 *
 * Uses locals of the enclosing function: s0..s5, f0..f5, s0f..s5f, c64, c7
 * and is6tap. The expansion is several statements and is not wrapped in
 * do { } while (0).
 */
170 #define FILTER_V(dstv, vec_mul) \
171  s1f = (vec_s16)vec_mul(s1, f1); \
172  s2f = (vec_s16)vec_mul(s2, f2); \
173  s3f = (vec_s16)vec_mul(s3, f3); \
174  s4f = (vec_s16)vec_mul(s4, f4); \
175  s2f = vec_subs(s2f, s1f); \
176  s3f = vec_subs(s3f, s4f); \
177  if (is6tap) { \
178  s0f = (vec_s16)vec_mul(s0, f0); \
179  s5f = (vec_s16)vec_mul(s5, f5); \
180  s2f = vec_adds(s2f, s0f); \
181  s3f = vec_adds(s3f, s5f); \
182  } \
183  dstv = vec_adds(s2f, s3f); \
184  dstv = vec_adds(dstv, c64); \
185  dstv = vec_sra(dstv, c7)
186 
/*
 * Load one source row for the vertical filter.
 * Big-endian: load_with_perm_vec with the caller-prepared permute vector.
 * Little-endian: two unaligned loads at off and off+8 are interleaved with
 * vec_mergeh; `perm` is unused and the read extends to off+8+16 bytes
 * regardless of the block width.
 */
187 #if HAVE_BIGENDIAN
188 #define LOAD_HL(off, s, perm) load_with_perm_vec(off, s, perm)
189 #else
190 #define LOAD_HL(off, s, perm) vec_mergeh(vec_vsx_ld(off,s), vec_vsx_ld(off+8,s))
191 #endif
192 
/**
 * Vertical subpel interpolation of a w-wide, h-high block.
 *
 * Keeps a sliding window of source rows in s0..s5 (s1..s4 for 4-tap),
 * loads one new row per output row and shifts the window down afterwards.
 *
 * @param dst        destination; advanced by dst_stride after each row
 * @param dst_stride byte step between destination rows
 * @param src        source; rows from src - src_stride (4-tap) or
 *                   src - 2*src_stride (6-tap) onwards are read
 * @param src_stride byte step between source rows
 * @param h          number of rows to produce
 * @param my         filter selector; my-1 indexes the 7-entry tap table,
 *                   so presumably 1..7 -- TODO confirm with callers
 * @param w          block width; the code distinguishes 16, 8 and 4
 * @param is6tap     nonzero to include the two outer taps; it is also used
 *                   arithmetically in the row offset, so callers pass 0 or 1
 */
193 static av_always_inline
194 void put_vp8_epel_v_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
195  uint8_t *src, ptrdiff_t src_stride,
196  int h, int my, int w, int is6tap)
197 {
198  LOAD_V_SUBPEL_FILTER(my-1);
199  vec_u8 s0, s1, s2, s3, s4, s5, filt, align_vech, perm_vec, align_vecl;
200  vec_s16 s0f, s1f, s2f, s3f, s4f, s5f, f16h, f16l;
 // Rounding bias 64 (1 << 6) and final shift amount 7.
201  vec_s16 c64 = vec_sl(vec_splat_s16(1), vec_splat_u16(6));
202  vec_u16 c7 = vec_splat_u16(7);
203 
204 #if HAVE_BIGENDIAN
205  // we want pixels 0-7 to be in the even positions and 8-15 in the odd,
206  // so combine this permute with the alignment permute vector
 // For w != 16 the same eight pixels are placed in both even and odd lanes.
207  align_vech = vec_lvsl(0, src);
208  align_vecl = vec_sld(align_vech, align_vech, 8);
209  if (w ==16)
210  perm_vec = vec_mergeh(align_vech, align_vecl);
211  else
212  perm_vec = vec_mergeh(align_vech, align_vech);
213 #endif
214 
 // Prime the window. Only the two `if (is6tap)` loads are conditional;
 // s1, s2 and s3 are always loaded.
215  if (is6tap)
216  s0 = LOAD_HL(-2*src_stride, src, perm_vec);
217  s1 = LOAD_HL(-1*src_stride, src, perm_vec);
218  s2 = LOAD_HL( 0*src_stride, src, perm_vec);
219  s3 = LOAD_HL( 1*src_stride, src, perm_vec);
220  if (is6tap)
221  s4 = LOAD_HL( 2*src_stride, src, perm_vec);
222 
 // Point src at the first row not yet in the window.
223  src += (2+is6tap)*src_stride;
224 
225  while (h --> 0) {
 // Load the newest (bottom) row of the window.
226  if (is6tap)
227  s5 = LOAD_HL(0, src, perm_vec);
228  else
229  s4 = LOAD_HL(0, src, perm_vec);
230 
 // Even lanes: pixels 0-7.
231  FILTER_V(f16h, vec_mule);
232 
233  if (w == 16) {
 // Odd lanes: pixels 8-15, then one full 16-byte store.
 // NOTE(review): vec_st is an aligned store, so dst is presumably
 // 16-byte aligned on this path -- confirm with callers.
234  FILTER_V(f16l, vec_mulo);
235  filt = vec_packsu(f16h, f16l);
236  vec_st(filt, 0, dst);
237  } else {
238  filt = vec_packsu(f16h, f16h);
 // w == 4: replicate 32-bit element 0 to every lane before the store.
 // w == 8: additionally store the element at dst + 4.
 // The final vec_ste at offset 0 runs in both cases; it is not part
 // of the else branch.
239  if (w == 4)
240  filt = (vec_u8)vec_splat((vec_u32)filt, 0);
241  else
242  vec_ste((vec_u32)filt, 4, (uint32_t*)dst);
243  vec_ste((vec_u32)filt, 0, (uint32_t*)dst);
244  }
245 
 // Slide the window down one row. Only s0 = s1 and s4 = s5 are
 // conditional on is6tap.
246  if (is6tap)
247  s0 = s1;
248  s1 = s2;
249  s2 = s3;
250  s3 = s4;
251  if (is6tap)
252  s4 = s5;
253 
254  dst += dst_stride;
255  src += src_stride;
256  }
257 }
258 
/*
 * Define the two single-direction entry points for one block width and tap
 * count:
 *   put_vp8_epel<WIDTH>_h<TAPS>_altivec - horizontal filter, uses mx
 *   put_vp8_epel<WIDTH>_v<TAPS>_altivec - vertical filter, uses my
 * Each forwards to the matching always-inline core with WIDTH and
 * (TAPS == 6) as constant arguments. Both wrappers are av_noinline.
 */
259 #define EPEL_FUNCS(WIDTH, TAPS) \
260 static av_noinline \
261 void put_vp8_epel ## WIDTH ## _h ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my) \
262 { \
263  put_vp8_epel_h_altivec_core(dst, dst_stride, src, src_stride, h, mx, WIDTH, TAPS == 6); \
264 } \
265 \
266 static av_noinline \
267 void put_vp8_epel ## WIDTH ## _v ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my) \
268 { \
269  put_vp8_epel_v_altivec_core(dst, dst_stride, src, src_stride, h, my, WIDTH, TAPS == 6); \
270 }
271 
/*
 * Define put_vp8_epel<WIDTH>_h<HTAPS>v<VTAPS>_altivec, the two-pass
 * (horizontal then vertical) filter.
 *
 * The horizontal pass writes into a 16-byte-aligned scratch buffer with a
 * fixed row stride of 16 bytes; the vertical pass reads from it.
 *   VTAPS == 6: horizontal pass starts 2 rows above src and produces h+5
 *               rows; vertical pass starts at scratch row 2.
 *   otherwise:  horizontal pass starts 1 row above src and produces h+4
 *               rows; vertical pass starts at scratch row 1.
 *
 * The scratch buffer holds 2*WIDTH+5 rows, so this assumes h <= 2*WIDTH --
 * TODO confirm with callers.
 * NOTE(review): the 4-tap vertical core reads rows -1..h+1 relative to its
 * src, i.e. h+3 rows, while h+4 are generated here -- confirm whether the
 * extra row is intentional.
 */
272 #define EPEL_HV(WIDTH, HTAPS, VTAPS) \
273 static void put_vp8_epel ## WIDTH ## _h ## HTAPS ## v ## VTAPS ## _altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my) \
274 { \
275  DECLARE_ALIGNED(16, uint8_t, tmp)[(2*WIDTH+5)*16]; \
276  if (VTAPS == 6) { \
277  put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16, src-2*sstride, sstride, h+5, mx, my); \
278  put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+2*16, 16, h, mx, my); \
279  } else { \
280  put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16, src-sstride, sstride, h+4, mx, my); \
281  put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+16, 16, h, mx, my); \
282  } \
283 }
284 
/* Single-direction filters: width 16 gets only the 6-tap variants,
 * widths 8 and 4 get both 6-tap and 4-tap. */
285 EPEL_FUNCS(16,6)
286 EPEL_FUNCS(8, 6)
287 EPEL_FUNCS(8, 4)
288 EPEL_FUNCS(4, 6)
289 EPEL_FUNCS(4, 4)
290 
/* Two-pass filters, arguments are (WIDTH, HTAPS, VTAPS). Each one calls the
 * EPEL_FUNCS-generated functions of the same width and tap counts, so those
 * must be instantiated above. */
291 EPEL_HV(16, 6,6)
292 EPEL_HV(8, 6,6)
293 EPEL_HV(8, 4,6)
294 EPEL_HV(8, 6,4)
295 EPEL_HV(8, 4,4)
296 EPEL_HV(4, 6,6)
297 EPEL_HV(4, 4,6)
298 EPEL_HV(4, 6,4)
299 EPEL_HV(4, 4,4)
300 
/**
 * Copy a 16-byte-wide block from src to dst without filtering.
 *
 * Rows are copied four at a time, so the number of rows actually written is
 * h rounded up to a multiple of 4. mx and my are unused.
 *
 * @param dst     destination, written with vec_st (an aligned store, so
 *                presumably 16-byte aligned -- confirm with callers)
 * @param dstride byte step between destination rows
 * @param src     source, read through load_with_perm_vec
 * @param sstride byte step between source rows
 * @param h       number of rows to copy
 */
301 static void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my)
302 {
303  register vector unsigned char perm;
304  int i;
 // Precomputed 2x, 3x and 4x row strides for the unrolled loop below.
305  register ptrdiff_t dstride2 = dstride << 1, sstride2 = sstride << 1;
306  register ptrdiff_t dstride3 = dstride2 + dstride, sstride3 = sstride + sstride2;
307  register ptrdiff_t dstride4 = dstride << 2, sstride4 = sstride << 2;
308 
 // perm is assigned only on big-endian builds.
 // NOTE(review): on little-endian it is passed uninitialized to
 // load_with_perm_vec, which presumably ignores it there -- confirm in
 // util_altivec.h.
309 #if HAVE_BIGENDIAN
310  perm = vec_lvsl(0, src);
311 #endif
312 // hand-unrolling the loop by 4 gains about 15%
313 // minimum execution time goes from 74 to 60 cycles
314 // it's faster than -funroll-loops, but using
315 // -funroll-loops w/ this is bad - 74 cycles again.
316 // all this is on a 7450, tuning for the 7450
317  for (i = 0; i < h; i += 4) {
318  vec_st(load_with_perm_vec(0, src, perm), 0, dst);
319  vec_st(load_with_perm_vec(sstride, src, perm), dstride, dst);
320  vec_st(load_with_perm_vec(sstride2, src, perm), dstride2, dst);
321  vec_st(load_with_perm_vec(sstride3, src, perm), dstride3, dst);
322  src += sstride4;
323  dst += dstride4;
324  }
325 }
326 
327 #endif /* HAVE_ALTIVEC */
328 
329 
/**
 * Install the AltiVec implementations into a VP8 DSP context.
 *
 * Does nothing when the build has no AltiVec support or when the running
 * CPU does not report AltiVec; entries not assigned here are left as the
 * caller set them.
 *
 * As the assignments below show, put_vp8_epel_pixels_tab is indexed as
 * [size][vertical][horizontal]: size 0/1/2 is block width 16/8/4, and the
 * vertical/horizontal index is 0 for no filtering in that direction,
 * 1 for the 4-tap filter and 2 for the 6-tap filter. Width 16 only gets the
 * plain copy and the 6-tap variants.
 *
 * @param c DSP context whose function table is updated
 */
330 av_cold void ff_vp78dsp_init_ppc(VP8DSPContext *c)
331 {
332 #if HAVE_ALTIVEC
 // Runtime check: keep the existing table when the CPU lacks AltiVec.
333  if (!PPC_ALTIVEC(av_get_cpu_flags()))
334  return;
335 
336  c->put_vp8_epel_pixels_tab[0][0][0] = put_vp8_pixels16_altivec;
337  c->put_vp8_epel_pixels_tab[0][0][2] = put_vp8_epel16_h6_altivec;
338  c->put_vp8_epel_pixels_tab[0][2][0] = put_vp8_epel16_v6_altivec;
339  c->put_vp8_epel_pixels_tab[0][2][2] = put_vp8_epel16_h6v6_altivec;
340 
341  c->put_vp8_epel_pixels_tab[1][0][2] = put_vp8_epel8_h6_altivec;
342  c->put_vp8_epel_pixels_tab[1][2][0] = put_vp8_epel8_v6_altivec;
343  c->put_vp8_epel_pixels_tab[1][0][1] = put_vp8_epel8_h4_altivec;
344  c->put_vp8_epel_pixels_tab[1][1][0] = put_vp8_epel8_v4_altivec;
345 
346  c->put_vp8_epel_pixels_tab[1][2][2] = put_vp8_epel8_h6v6_altivec;
347  c->put_vp8_epel_pixels_tab[1][1][1] = put_vp8_epel8_h4v4_altivec;
348  c->put_vp8_epel_pixels_tab[1][1][2] = put_vp8_epel8_h6v4_altivec;
349  c->put_vp8_epel_pixels_tab[1][2][1] = put_vp8_epel8_h4v6_altivec;
350 
351  c->put_vp8_epel_pixels_tab[2][0][2] = put_vp8_epel4_h6_altivec;
352  c->put_vp8_epel_pixels_tab[2][2][0] = put_vp8_epel4_v6_altivec;
353  c->put_vp8_epel_pixels_tab[2][0][1] = put_vp8_epel4_h4_altivec;
354  c->put_vp8_epel_pixels_tab[2][1][0] = put_vp8_epel4_v4_altivec;
355 
356  c->put_vp8_epel_pixels_tab[2][2][2] = put_vp8_epel4_h6v6_altivec;
357  c->put_vp8_epel_pixels_tab[2][1][1] = put_vp8_epel4_h4v4_altivec;
358  c->put_vp8_epel_pixels_tab[2][1][2] = put_vp8_epel4_h6v4_altivec;
359  c->put_vp8_epel_pixels_tab[2][2][1] = put_vp8_epel4_h4v6_altivec;
360 #endif /* HAVE_ALTIVEC */
361 }
s5
#define s5
Definition: regdef.h:42
mem_internal.h
vec_s8
#define vec_s8
Definition: util_altivec.h:35
w
uint8_t w
Definition: llviddspenc.c:39
b
#define b
Definition: input.c:41
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:95
s3
#define s3
Definition: regdef.h:40
perm
perm
Definition: f_perms.c:74
vec_s32
#define vec_s32
Definition: util_altivec.h:39
vec_s16
#define vec_s16
Definition: util_altivec.h:37
av_cold
#define av_cold
Definition: attributes.h:90
vp8dsp.h
s1
#define s1
Definition: regdef.h:38
src
#define src
Definition: vp8dsp.c:255
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
VP8DSPContext
Definition: vp8dsp.h:37
s2
#define s2
Definition: regdef.h:39
PPC_ALTIVEC
#define PPC_ALTIVEC(flags)
Definition: cpu.h:25
vec_u32
#define vec_u32
Definition: util_altivec.h:38
cpu.h
a
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:41
vec_u8
#define vec_u8
Definition: util_altivec.h:34
i
int i
Definition: input.c:407
av_always_inline
#define av_always_inline
Definition: attributes.h:49
uint8_t
uint8_t
Definition: audio_convert.c:194
filt
static const int8_t filt[NUMTAPS *2]
Definition: af_earwax.c:39
s4
#define s4
Definition: regdef.h:41
hpeldsp_altivec.h
ff_vp78dsp_init_ppc
av_cold void ff_vp78dsp_init_ppc(VP8DSPContext *c)
Definition: vp8dsp_altivec.c:330
s0
#define s0
Definition: regdef.h:37
util_altivec.h
cpu.h
h
h
Definition: vp9dsp_template.c:2038
EPEL_FUNCS
#define EPEL_FUNCS(depth)
vec_u16
#define vec_u16
Definition: util_altivec.h:36