FFmpeg
swscale_altivec.c
Go to the documentation of this file.
1 /*
2  * AltiVec-enhanced yuv2yuvX
3  *
4  * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5  * based on the equivalent C code in swscale.c
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
#include <inttypes.h>

#include "config.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "yuv2rgb_altivec.h"
#include "libavutil/ppc/util_altivec.h"
33 
34 #if HAVE_ALTIVEC
35 #if HAVE_BIGENDIAN
/* All-zero vector of 32-bit integers; only defined while the big-endian
 * helper macros and the included template are being compiled (it is
 * #undef'd again after the template include). */
#define vzero (vec_splat_s32(0))
37 
/*
 * Load the 16-byte block at byte offset ((idx) << 1) + 16 from `base`,
 * permute it together with the previously loaded block `prev` through
 * `pattern`, and store the result in the variable `ls`, which must exist
 * in the scope where the macro is expanded. `prev` is then overwritten
 * with the freshly loaded block.
 */
#define GET_LS(prev, idx, pattern, base) {                                   \
    vector signed short ls_next_ = vec_ld(((idx) << 1) + 16, base);          \
    ls   = vec_perm(prev, ls_next_, pattern);                                \
    prev = ls_next_;                                                         \
}
43 
/*
 * Multiply eight 16-bit samples by `coeffs` and accumulate the 32-bit
 * products into the two accumulators `acc_a` and `acc_b`.
 *
 * The samples are fetched with GET_LS(), which writes them into the local
 * `ls` by name. vec_mule()/vec_mulo() produce the even and odd products;
 * vec_mergeh()/vec_mergel() interleave them again before they are added.
 */
#define yuv2planeX_8(acc_a, acc_b, carry, samples, idx, pattern, coeffs) do { \
    vector signed short ls;                                                   \
    vector signed int   prod_even_, prod_odd_;                                \
    GET_LS(carry, idx, pattern, samples);                                     \
    prod_even_ = vec_mule(coeffs, ls);                                        \
    prod_odd_  = vec_mulo(coeffs, ls);                                        \
    acc_a = vec_add(acc_a, vec_mergeh(prod_even_, prod_odd_));                \
    acc_b = vec_add(acc_b, vec_mergel(prod_even_, prod_odd_));                \
} while (0)
55 
/*
 * Load one vector from `coeffs` at byte offset `joffset` (a variable taken
 * from the expansion scope) into `out`, then permute it with itself using
 * the pattern vec_lvsl() returns for the same address.
 */
#define LOAD_FILTER(out, coeffs) {                                    \
    vector unsigned char filter_perm_ = vec_lvsl(joffset, coeffs);    \
    out = vec_ld(joffset, coeffs);                                    \
    out = vec_perm(out, out, filter_perm_);                           \
}
/*
 * Prime a sequence of GET_LS()-style loads: store the vec_lvsl() permute
 * pattern for `base` + `xoffset` in `pattern` and the block loaded from
 * that address in `first`. `xoffset` comes from the expansion scope.
 */
#define LOAD_L1(first, base, pattern) {       \
    pattern = vec_lvsl(xoffset, base);        \
    first   = vec_ld(xoffset, base);          \
}
65 
// The shift by 3 used in GET_VF4 below is
// 2 (filterSize == 4) + 1 (sizeof(short) == 2).

// The neat trick: We only care for half the elements,
// high or low depending on (i<<3)%16 (it's 0 or 8 here),
// and we're going to use vec_mule, so we choose
// carefully how to "unpack" the elements into the even slots.
#define GET_VF4(idx, out, coeffs) {                                   \
    out = vec_ld(idx<< 3, coeffs);                                    \
    if (((idx << 3) % 16) == 0)                                       \
        out = vec_mergeh(out, (vector signed short)vzero);            \
    else                                                              \
        out = vec_mergel(out, (vector signed short)vzero);            \
}
/*
 * Load the block at byte offset `offset` from `base` into `block` and
 * store the matching vec_lvsl() permute pattern in `pattern`.
 */
#define FIRST_LOAD(block, offset, base, pattern) {   \
    block   = vec_ld(offset, base);                  \
    pattern = vec_lvsl(offset, base);                \
}
/* Copy two values in one go: `to_a` receives `from_a`, then `to_b`
 * receives `from_b`. */
#define UPDATE_PTR(from_a, to_a, from_b, to_b) {   \
    to_a = from_a;                                 \
    to_b = from_b;                                 \
}
/*
 * Load the block at byte offset pos + extra + 16 from `base` into `next`
 * and permute it with the already loaded block `prev` through `pattern`;
 * the permuted vector is stored in `out`.
 */
#define LOAD_SRCV(pos, extra, base, pattern, prev, next, out) {   \
    next = vec_ld(pos + extra + 16, base);                        \
    out  = vec_perm(prev, next, pattern);                         \
}
/*
 * Variant of LOAD_SRCV that skips the second load when it is not needed.
 *
 * The block at byte offset pos + a + 16 from `s` is loaded into `v1` only
 * when ((uintptr_t)s + pos) % 16 is greater than 8; otherwise `v1` is
 * handed to vec_perm() with whatever value it already holds. The permuted
 * result of `v0` and `v1` through `per` is stored in `vf`.
 *
 * The permute uses the `v1` argument; it used to name `src_v1` directly,
 * which only worked when the caller's variable happened to be called that.
 */
#define LOAD_SRCV8(pos, a, s, per, v0, v1, vf) {\
    if ((((uintptr_t)s + pos) % 16) > 8) {\
        v1 = vec_ld(pos + a + 16, s);\
    }\
    vf = vec_perm(v0, v1, per);\
}
/*
 * Load the coefficient block at byte offset
 * (row * 2 * filterSize) + (col * 2) + 16 + extra from `coeffs` into
 * `next` and permute it with `prev` through `pattern` into `out`.
 * `filterSize` is taken from the scope where the macro is expanded.
 */
#define GET_VFD(row, col, coeffs, prev, next, pattern, out, extra) {            \
    next = vec_ld((row * 2 * filterSize) + (col * 2) + 16 + extra, coeffs);     \
    out  = vec_perm(prev, next, pattern);                                       \
}
101 
102 #define FUNC(name) name ## _altivec
103 #include "swscale_ppc_template.c"
104 #undef FUNC
105 
106 #undef vzero
107 
108 #endif /* HAVE_BIGENDIAN */
109 
/*
 * Store one clipped 16-bit sample at `pos`.
 *
 * The value written is bias + av_clip_<signedness>16(val >> shift). It is
 * stored with AV_WB16() when `big_endian` is non-zero and with AV_WL16()
 * otherwise. `big_endian` and `shift` are NOT parameters: they are picked
 * up by name from the scope in which the macro is expanded.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a single
 * statement (safe inside an unbraced if/else), and the arguments are
 * parenthesized so that expression arguments keep their meaning.
 */
#define output_pixel(pos, val, bias, signedness) \
    do { \
        if (big_endian) { \
            AV_WB16((pos), (bias) + av_clip_ ## signedness ## 16((val) >> shift)); \
        } else { \
            AV_WL16((pos), (bias) + av_clip_ ## signedness ## 16((val) >> shift)); \
        } \
    } while (0)
116 
117 static void
118 yuv2plane1_float_u(const int32_t *src, float *dest, int dstW, int start)
119 {
120  static const int big_endian = HAVE_BIGENDIAN;
121  static const int shift = 3;
122  static const float float_mult = 1.0f / 65535.0f;
123  int i, val;
124  uint16_t val_uint;
125 
126  for (i = start; i < dstW; ++i){
127  val = src[i] + (1 << (shift - 1));
128  output_pixel(&val_uint, val, 0, uint);
129  dest[i] = float_mult * (float)val_uint;
130  }
131 }
132 
133 static void
134 yuv2plane1_float_bswap_u(const int32_t *src, uint32_t *dest, int dstW, int start)
135 {
136  static const int big_endian = HAVE_BIGENDIAN;
137  static const int shift = 3;
138  static const float float_mult = 1.0f / 65535.0f;
139  int i, val;
140  uint16_t val_uint;
141 
142  for (i = start; i < dstW; ++i){
143  val = src[i] + (1 << (shift - 1));
144  output_pixel(&val_uint, val, 0, uint);
145  dest[i] = av_bswap32(av_float2int(float_mult * (float)val_uint));
146  }
147 }
148 
149 static void yuv2plane1_float_altivec(const int32_t *src, float *dest, int dstW)
150 {
151  const int dst_u = -(uintptr_t)dest & 3;
152  const int shift = 3;
153  const int add = (1 << (shift - 1));
154  const int clip = (1 << 16) - 1;
155  const float fmult = 1.0f / 65535.0f;
156  const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
157  const vector uint32_t vshift = (vector uint32_t) vec_splat_u32(shift);
158  const vector uint32_t vlargest = (vector uint32_t) {clip, clip, clip, clip};
159  const vector float vmul = (vector float) {fmult, fmult, fmult, fmult};
160  const vector float vzero = (vector float) {0, 0, 0, 0};
161  vector uint32_t v;
162  vector float vd;
163  int i;
164 
165  yuv2plane1_float_u(src, dest, dst_u, 0);
166 
167  for (i = dst_u; i < dstW - 3; i += 4) {
168  v = vec_ld(0, (const uint32_t *) &src[i]);
169  v = vec_add(v, vadd);
170  v = vec_sr(v, vshift);
171  v = vec_min(v, vlargest);
172 
173  vd = vec_ctf(v, 0);
174  vd = vec_madd(vd, vmul, vzero);
175 
176  vec_st(vd, 0, &dest[i]);
177  }
178 
179  yuv2plane1_float_u(src, dest, dstW, i);
180 }
181 
182 static void yuv2plane1_float_bswap_altivec(const int32_t *src, uint32_t *dest, int dstW)
183 {
184  const int dst_u = -(uintptr_t)dest & 3;
185  const int shift = 3;
186  const int add = (1 << (shift - 1));
187  const int clip = (1 << 16) - 1;
188  const float fmult = 1.0f / 65535.0f;
189  const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
190  const vector uint32_t vshift = (vector uint32_t) vec_splat_u32(shift);
191  const vector uint32_t vlargest = (vector uint32_t) {clip, clip, clip, clip};
192  const vector float vmul = (vector float) {fmult, fmult, fmult, fmult};
193  const vector float vzero = (vector float) {0, 0, 0, 0};
194  const vector uint32_t vswapbig = (vector uint32_t) {16, 16, 16, 16};
195  const vector uint16_t vswapsmall = vec_splat_u16(8);
196  vector uint32_t v;
197  vector float vd;
198  int i;
199 
200  yuv2plane1_float_bswap_u(src, dest, dst_u, 0);
201 
202  for (i = dst_u; i < dstW - 3; i += 4) {
203  v = vec_ld(0, (const uint32_t *) &src[i]);
204  v = vec_add(v, vadd);
205  v = vec_sr(v, vshift);
206  v = vec_min(v, vlargest);
207 
208  vd = vec_ctf(v, 0);
209  vd = vec_madd(vd, vmul, vzero);
210 
211  vd = (vector float) vec_rl((vector uint32_t) vd, vswapbig);
212  vd = (vector float) vec_rl((vector uint16_t) vd, vswapsmall);
213 
214  vec_st(vd, 0, (float *) &dest[i]);
215  }
216 
217  yuv2plane1_float_bswap_u(src, dest, dstW, i);
218 }
219 
/*
 * Generate a wrapper named yuv2plane1_float<endian>_altivec that forwards
 * to `impl` after casting src to const int32_t * and dest to `sample_t *`.
 * The `dither` and `offset` arguments of the wrapper are not used.
 */
#define yuv2plane1_float(impl, sample_t, endian)                                        \
static void yuv2plane1_float ## endian ## _altivec(const int16_t *src, uint8_t *dest,   \
                                                   int dstW,                            \
                                                   const uint8_t *dither, int offset)   \
{                                                                                       \
    impl((const int32_t *)src, (sample_t *)dest, dstW);                                 \
}

/* The native byte order gets the plain routine, the other one the
 * byte-swapping routine. */
#if HAVE_BIGENDIAN
yuv2plane1_float(yuv2plane1_float_altivec,       float,    BE)
yuv2plane1_float(yuv2plane1_float_bswap_altivec, uint32_t, LE)
#else
yuv2plane1_float(yuv2plane1_float_altivec,       float,    LE)
yuv2plane1_float(yuv2plane1_float_bswap_altivec, uint32_t, BE)
#endif
235 
236 #endif /* HAVE_ALTIVEC */
237 
239 {
240 #if HAVE_ALTIVEC
241  enum AVPixelFormat dstFormat = c->dstFormat;
242 
244  return;
245 
246 #if HAVE_BIGENDIAN
247  if (c->srcBpc == 8 && c->dstBpc <= 14) {
248  c->hyScale = c->hcScale = hScale_real_altivec;
249  }
250  if (!is16BPS(dstFormat) && !isNBPS(dstFormat) && !isSemiPlanarYUV(dstFormat) &&
251  dstFormat != AV_PIX_FMT_GRAYF32BE && dstFormat != AV_PIX_FMT_GRAYF32LE &&
252  !c->needAlpha) {
253  c->yuv2planeX = yuv2planeX_altivec;
254  }
255 #endif
256 
257  if (dstFormat == AV_PIX_FMT_GRAYF32BE) {
258  c->yuv2plane1 = yuv2plane1_floatBE_altivec;
259  } else if (dstFormat == AV_PIX_FMT_GRAYF32LE) {
260  c->yuv2plane1 = yuv2plane1_floatLE_altivec;
261  }
262 
263  /* The following list of supported dstFormat values should
264  * match what's found in the body of ff_yuv2packedX_altivec() */
265  if (!(c->flags & (SWS_BITEXACT | SWS_FULL_CHR_H_INT)) && !c->needAlpha) {
266  switch (c->dstFormat) {
267  case AV_PIX_FMT_ABGR:
268  c->yuv2packedX = ff_yuv2abgr_X_altivec;
269  break;
270  case AV_PIX_FMT_BGRA:
271  c->yuv2packedX = ff_yuv2bgra_X_altivec;
272  break;
273  case AV_PIX_FMT_ARGB:
274  c->yuv2packedX = ff_yuv2argb_X_altivec;
275  break;
276  case AV_PIX_FMT_RGBA:
277  c->yuv2packedX = ff_yuv2rgba_X_altivec;
278  break;
279  case AV_PIX_FMT_BGR24:
280  c->yuv2packedX = ff_yuv2bgr24_X_altivec;
281  break;
282  case AV_PIX_FMT_RGB24:
283  c->yuv2packedX = ff_yuv2rgb24_X_altivec;
284  break;
285  }
286  }
287 #endif /* HAVE_ALTIVEC */
288 
290 }
AVPixelFormat
AVPixelFormat
Pixel format.
Definition: pixfmt.h:64
AV_PIX_FMT_BGR24
@ AV_PIX_FMT_BGR24
packed RGB 8:8:8, 24bpp, BGRBGR...
Definition: pixfmt.h:69
AV_PIX_FMT_BGRA
@ AV_PIX_FMT_BGRA
packed BGRA 8:8:8:8, 32bpp, BGRABGRA...
Definition: pixfmt.h:95
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:93
av_float2int
static av_always_inline uint32_t av_float2int(float f)
Reinterpret a float as a 32-bit integer.
Definition: intfloat.h:50
AV_PIX_FMT_GRAYF32LE
@ AV_PIX_FMT_GRAYF32LE
IEEE-754 single precision Y, 32bpp, little-endian.
Definition: pixfmt.h:341
is16BPS
static av_always_inline int is16BPS(enum AVPixelFormat pix_fmt)
Definition: swscale_internal.h:646
SWS_BITEXACT
#define SWS_BITEXACT
Definition: swscale.h:84
start
void INT64 start
Definition: avisynth_c.h:767
isNBPS
static av_always_inline int isNBPS(enum AVPixelFormat pix_fmt)
Definition: swscale_internal.h:653
src
#define src
Definition: vp8dsp.c:254
av_bswap32
#define av_bswap32
Definition: bswap.h:33
av_cold
#define av_cold
Definition: attributes.h:84
AV_PIX_FMT_RGBA
@ AV_PIX_FMT_RGBA
packed RGBA 8:8:8:8, 32bpp, RGBARGBA...
Definition: pixfmt.h:93
int32_t
int32_t
Definition: audio_convert.c:194
isSemiPlanarYUV
static av_always_inline int isSemiPlanarYUV(enum AVPixelFormat pix_fmt)
Definition: swscale_internal.h:685
yuv2plane1_float
yuv2plane1_float(yuv2plane1_float_c_template, float, LE)
Definition: output.c:304
AV_CPU_FLAG_ALTIVEC
#define AV_CPU_FLAG_ALTIVEC
standard
Definition: cpu.h:60
AV_PIX_FMT_ABGR
@ AV_PIX_FMT_ABGR
packed ABGR 8:8:8:8, 32bpp, ABGRABGR...
Definition: pixfmt.h:94
c
Undefined Behavior: In the C language, some operations are undefined, like signed integer overflow, dereferencing freed pointers, accessing outside allocated space, ... Undefined Behavior must not occur in a C program; it is not safe even if the output of undefined operations is unused. The unsafety may seem nit-picking, but optimizing compilers have in fact optimized code on the assumption that no undefined behavior occurs. Optimizing code based on wrong assumptions can and has in some cases led to effects beyond the output of computations. The signed integer overflow problem in speed-critical code: Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not [excerpt truncated] c
Definition: undefined.txt:32
AV_PIX_FMT_RGB24
@ AV_PIX_FMT_RGB24
packed RGB 8:8:8, 24bpp, RGBRGB...
Definition: pixfmt.h:68
cpu.h
SWS_FULL_CHR_H_INT
#define SWS_FULL_CHR_H_INT
Definition: swscale.h:79
val
const char const char void * val
Definition: avisynth_c.h:863
output_pixel
#define output_pixel(pos, val, bias, signedness)
Definition: output.c:887
yuv2rgb_altivec.h
attributes.h
ff_sws_init_swscale_vsx
av_cold void ff_sws_init_swscale_vsx(SwsContext *c)
Definition: swscale_vsx.c:2072
AV_PIX_FMT_ARGB
@ AV_PIX_FMT_ARGB
packed ARGB 8:8:8:8, 32bpp, ARGBARGB...
Definition: pixfmt.h:92
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:259
swscale_internal.h
AV_PIX_FMT_GRAYF32BE
@ AV_PIX_FMT_GRAYF32BE
IEEE-754 single precision Y, 32bpp, big-endian.
Definition: pixfmt.h:340
ff_sws_init_swscale_ppc
av_cold void ff_sws_init_swscale_ppc(SwsContext *c)
Definition: swscale_altivec.c:238
swscale_ppc_template.c
config.h
shift
static int shift(int a, int b)
Definition: sonic.c:82
util_altivec.h
SwsContext
Definition: swscale_internal.h:280
clip
static double clip(void *opaque, double val)
Clip value val in the minval - maxval range.
Definition: vf_lut.c:162
swscale.h