FFmpeg
swscale_altivec.c
Go to the documentation of this file.
1 /*
2  * AltiVec-enhanced yuv2yuvX
3  *
4  * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5  * based on the equivalent C code in swscale.c
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #include <inttypes.h>
25 
26 #include "config.h"
27 #include "libswscale/swscale.h"
29 #include "libavutil/attributes.h"
30 #include "libavutil/cpu.h"
31 #include "yuv2rgb_altivec.h"
33 
34 #if HAVE_ALTIVEC
35 #if HAVE_BIGENDIAN
36 #define vzero vec_splat_s32(0)
37 
/*
 * Load the vector found at byte offset ((b) << 1) + 16 from s and combine it
 * with the previously loaded vector `a` through the permute mask `c`.
 *
 * The combined vector is written to `ls`, which is not a parameter and must
 * be declared where the macro is expanded. Afterwards `a` is replaced by the
 * vector that was just loaded, so the next expansion can reuse it.
 */
#define GET_LS(a,b,c,s) {\
        vector signed short next_ls = vec_ld(((b) << 1) + 16, s);\
        ls = vec_perm(a, next_ls, c);\
        a  = next_ls;\
    }
43 
/*
 * Multiply one vector of 16-bit source samples by `filter` and add the
 * 32-bit products to the accumulators d1 and d2.
 *
 * The samples are fetched with GET_LS(l1, x, perm, src), which also advances
 * l1. vec_mule/vec_mulo yield the even/odd products; merging their high and
 * low halves produces the two vectors that are added to d1 and d2.
 */
#define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do {\
        vector signed short ls;\
        vector signed int prod_even, prod_odd;\
        GET_LS(l1, x, perm, src);\
        prod_even = vec_mule(filter, ls);\
        prod_odd  = vec_mulo(filter, ls);\
        d1 = vec_add(d1, vec_mergeh(prod_even, prod_odd));\
        d2 = vec_add(d2, vec_mergel(prod_even, prod_odd));\
    } while (0)
55 
/*
 * Load a vector from f at byte offset `joffset` into vf, then permute vf with
 * itself using the vec_lvsl() mask computed for the same address.
 *
 * `joffset` is not a parameter; it has to be in scope at the expansion site.
 */
#define LOAD_FILTER(vf,f) {\
        vector unsigned char align_perm = vec_lvsl(joffset, f);\
        vf = vec_ld(joffset, f);\
        vf = vec_perm(vf, vf, align_perm);\
    }
/*
 * Prime a source row: load the vector at byte offset `xoffset` of s into ll1
 * and store the matching vec_lvsl() permute mask in p.
 *
 * `xoffset` is not a parameter; it has to be in scope at the expansion site.
 */
#define LOAD_L1(ll1,s,p){\
        ll1 = vec_ld(xoffset, s);\
        p   = vec_lvsl(xoffset, s);\
    }
65 
/*
 * GET_VF4: load the filter coefficients for index `a` from f into vf.
 *
 * The shift by 3 used below is 2 (filterSize == 4) + 1 (sizeof(short) == 2).
 *
 * The neat trick: we only care for half the loaded elements, the high or the
 * low ones depending on (a << 3) % 16 (it is 0 or 8 here). As the result is
 * going to be used with vec_mule, the wanted half is "unpacked" into the even
 * slots by merging it with zero.
 */
#define GET_VF4(a, vf, f) {\
        vf = vec_ld(a << 3, f);\
        if (((a << 3) % 16) == 0)\
            vf = vec_mergeh(vf, (vector signed short)vzero);\
        else\
            vf = vec_mergel(vf, (vector signed short)vzero);\
    }
/*
 * Start a run of source loads: store the vec_lvsl() permute mask for byte
 * offset `pos` of s in `per` and the vector loaded from that offset in `sv`.
 */
#define FIRST_LOAD(sv, pos, s, per) {\
        per = vec_lvsl(pos, s);\
        sv  = vec_ld(pos, s);\
    }
/*
 * Carry two loaded values over to the next iteration: d0 takes s0 first,
 * then d1 takes s1.
 */
#define UPDATE_PTR(s0, d0, s1, d1) {\
        d0 = s0; d1 = s1;\
    }
/*
 * Load the vector at byte offset pos + a + 16 of s into v1, then build vf by
 * permuting the previously loaded v0 with v1 through the mask `per`.
 */
#define LOAD_SRCV(pos, a, s, per, v0, v1, vf) {\
        v1 = vec_ld(pos + a + 16, s);\
        vf = vec_perm(v0, v1, per);\
    }
/*
 * Variant of LOAD_SRCV for runs of 8 source bytes.
 *
 * The second vector v1 is only loaded when the address s + pos lies more
 * than 8 bytes into its 16-byte line; otherwise v1 is left untouched. vf is
 * then built by permuting v0 with v1 through the mask `per`.
 *
 * The permute uses the v1 parameter (not a variable named src_v1 from the
 * expansion site), so the macro works whatever the caller's variable is
 * called.
 */
#define LOAD_SRCV8(pos, a, s, per, v0, v1, vf) {\
        if ((((uintptr_t)s + pos) % 16) > 8) {\
            v1 = vec_ld(pos + a + 16, s);\
        }\
        vf = vec_perm(v0, v1, per);\
    }
/*
 * Load the filter vector at byte offset
 * (a * 2 * filterSize) + (b * 2) + 16 + off of f into vf1, then build vf by
 * permuting the previously loaded vf0 with vf1 through the mask `per`.
 *
 * `filterSize` is not a parameter; it has to be in scope at the expansion
 * site.
 */
#define GET_VFD(a, b, f, vf0, vf1, per, vf, off) {\
        vf1 = vec_ld((a * 2 * filterSize) + (b * 2) + 16 + off, f);\
        vf  = vec_perm(vf0, vf1, per);\
    }
101 
/* Instantiate the shared PPC template with every FUNC(name) expanding to
 * name_altivec. Presumably this is where hScale_real_altivec and
 * yuv2planeX_altivec, referenced by the init code in this file, come from;
 * verify against swscale_ppc_template.c. FUNC and the vzero helper macro are
 * undefined again afterwards so they do not leak into the code below. */
102 #define FUNC(name) name ## _altivec
103 #include "swscale_ppc_template.c"
104 #undef FUNC
105 
106 #undef vzero
107 
108 #endif /* HAVE_BIGENDIAN */
109 
/*
 * Store a clipped 16-bit sample at pos.
 *
 * (val) >> shift is clipped with av_clip_<signedness>16(), bias is added, and
 * the result is written with AV_WB16() when big_endian is non-zero and with
 * AV_WL16() otherwise.
 *
 * `big_endian` and `shift` are not parameters; they have to be in scope at
 * the expansion site. The body is wrapped in do { } while (0) so that the
 * macro behaves like a single statement (e.g. inside an unbraced if/else),
 * and val/bias are parenthesized so that expression arguments bind as
 * expected.
 */
#define output_pixel(pos, val, bias, signedness) \
    do { \
        if (big_endian) { \
            AV_WB16(pos, (bias) + av_clip_ ## signedness ## 16((val) >> shift)); \
        } else { \
            AV_WL16(pos, (bias) + av_clip_ ## signedness ## 16((val) >> shift)); \
        } \
    } while (0)
116 
117 static void
118 yuv2plane1_float_u(const int32_t *src, float *dest, int dstW, int start)
119 {
120  static const int big_endian = HAVE_BIGENDIAN;
121  static const int shift = 3;
122  static const float float_mult = 1.0f / 65535.0f;
123  int i, val;
124  uint16_t val_uint;
125 
126  for (i = start; i < dstW; ++i){
127  val = src[i] + (1 << (shift - 1));
128  output_pixel(&val_uint, val, 0, uint);
129  dest[i] = float_mult * (float)val_uint;
130  }
131 }
132 
133 static void
134 yuv2plane1_float_bswap_u(const int32_t *src, uint32_t *dest, int dstW, int start)
135 {
136  static const int big_endian = HAVE_BIGENDIAN;
137  static const int shift = 3;
138  static const float float_mult = 1.0f / 65535.0f;
139  int i, val;
140  uint16_t val_uint;
141 
142  for (i = start; i < dstW; ++i){
143  val = src[i] + (1 << (shift - 1));
144  output_pixel(&val_uint, val, 0, uint);
145  dest[i] = av_bswap32(av_float2int(float_mult * (float)val_uint));
146  }
147 }
148 
149 static void yuv2plane1_float_altivec(const int32_t *src, float *dest, int dstW)
150 {
151  const int dst_u = -(uintptr_t)dest & 3;
152  const int shift = 3;
153  const int add = (1 << (shift - 1));
154  const int clip = (1 << 16) - 1;
155  const float fmult = 1.0f / 65535.0f;
156  const vec_u32 vadd = (vec_u32) {add, add, add, add};
157  const vec_u32 vshift = (vec_u32) vec_splat_u32(shift);
158  const vec_u32 vlargest = (vec_u32) {clip, clip, clip, clip};
159  const vec_f vmul = (vec_f) {fmult, fmult, fmult, fmult};
160  const vec_f vzero = (vec_f) {0, 0, 0, 0};
161  vec_u32 v;
162  vec_f vd;
163  int i;
164 
165  yuv2plane1_float_u(src, dest, dst_u, 0);
166 
167  for (i = dst_u; i < dstW - 3; i += 4) {
168  v = vec_ld(0, (const uint32_t *) &src[i]);
169  v = vec_add(v, vadd);
170  v = vec_sr(v, vshift);
171  v = vec_min(v, vlargest);
172 
173  vd = vec_ctf(v, 0);
174  vd = vec_madd(vd, vmul, vzero);
175 
176  vec_st(vd, 0, &dest[i]);
177  }
178 
179  yuv2plane1_float_u(src, dest, dstW, i);
180 }
181 
182 static void yuv2plane1_float_bswap_altivec(const int32_t *src, uint32_t *dest, int dstW)
183 {
184  const int dst_u = -(uintptr_t)dest & 3;
185  const int shift = 3;
186  const int add = (1 << (shift - 1));
187  const int clip = (1 << 16) - 1;
188  const float fmult = 1.0f / 65535.0f;
189  const vec_u32 vadd = (vec_u32) {add, add, add, add};
190  const vec_u32 vshift = (vec_u32) vec_splat_u32(shift);
191  const vec_u32 vlargest = (vec_u32) {clip, clip, clip, clip};
192  const vec_f vmul = (vec_f) {fmult, fmult, fmult, fmult};
193  const vec_f vzero = (vec_f) {0, 0, 0, 0};
194  const vec_u32 vswapbig = (vec_u32) {16, 16, 16, 16};
195  const vec_u16 vswapsmall = vec_splat_u16(8);
196  vec_u32 v;
197  vec_f vd;
198  int i;
199 
200  yuv2plane1_float_bswap_u(src, dest, dst_u, 0);
201 
202  for (i = dst_u; i < dstW - 3; i += 4) {
203  v = vec_ld(0, (const uint32_t *) &src[i]);
204  v = vec_add(v, vadd);
205  v = vec_sr(v, vshift);
206  v = vec_min(v, vlargest);
207 
208  vd = vec_ctf(v, 0);
209  vd = vec_madd(vd, vmul, vzero);
210 
211  vd = (vec_f) vec_rl((vec_u32) vd, vswapbig);
212  vd = (vec_f) vec_rl((vec_u16) vd, vswapsmall);
213 
214  vec_st(vd, 0, (float *) &dest[i]);
215  }
216 
217  yuv2plane1_float_bswap_u(src, dest, dstW, i);
218 }
219 
/*
 * Define yuv2plane1_float<endian>_altivec(), a wrapper with the
 * (src, dest, dstW, dither, offset) signature that forwards to `impl`.
 *
 * src is reinterpreted as const int32_t * and dest as out_t *; dither and
 * offset are accepted but not used.
 */
#define yuv2plane1_float(impl, out_t, endian) \
static void yuv2plane1_float ## endian ## _altivec(const int16_t *src, uint8_t *dest, \
                                                   int dstW, \
                                                   const uint8_t *dither, int offset) \
{ \
    const int32_t *in32 = (const int32_t *)src; \
    out_t *out = (out_t *)dest; \
    impl(in32, out, dstW); \
}
227 
228 #if HAVE_BIGENDIAN
229 yuv2plane1_float(yuv2plane1_float_altivec, float, BE)
230 yuv2plane1_float(yuv2plane1_float_bswap_altivec, uint32_t, LE)
231 #else
232 yuv2plane1_float(yuv2plane1_float_altivec, float, LE)
233 yuv2plane1_float(yuv2plane1_float_bswap_altivec, uint32_t, BE)
234 #endif
235 
236 #endif /* HAVE_ALTIVEC */
237 
239 {
240 #if HAVE_ALTIVEC
241  enum AVPixelFormat dstFormat = c->dstFormat;
242 
244  return;
245 
246 #if HAVE_BIGENDIAN
247  if (c->srcBpc == 8 && c->dstBpc <= 14) {
248  c->hyScale = c->hcScale = hScale_real_altivec;
249  }
250  if (!is16BPS(dstFormat) && !isNBPS(dstFormat) && !isSemiPlanarYUV(dstFormat) &&
251  dstFormat != AV_PIX_FMT_GRAYF32BE && dstFormat != AV_PIX_FMT_GRAYF32LE &&
252  !c->needAlpha) {
253  c->yuv2planeX = yuv2planeX_altivec;
254  }
255 #endif
256 
257  if (dstFormat == AV_PIX_FMT_GRAYF32BE) {
258  c->yuv2plane1 = yuv2plane1_floatBE_altivec;
259  } else if (dstFormat == AV_PIX_FMT_GRAYF32LE) {
260  c->yuv2plane1 = yuv2plane1_floatLE_altivec;
261  }
262 
263  /* The following list of supported dstFormat values should
264  * match what's found in the body of ff_yuv2packedX_altivec() */
265  if (!(c->flags & (SWS_BITEXACT | SWS_FULL_CHR_H_INT)) && !c->needAlpha) {
266  switch (c->dstFormat) {
267  case AV_PIX_FMT_ABGR:
268  c->yuv2packedX = ff_yuv2abgr_X_altivec;
269  break;
270  case AV_PIX_FMT_BGRA:
271  c->yuv2packedX = ff_yuv2bgra_X_altivec;
272  break;
273  case AV_PIX_FMT_ARGB:
274  c->yuv2packedX = ff_yuv2argb_X_altivec;
275  break;
276  case AV_PIX_FMT_RGBA:
277  c->yuv2packedX = ff_yuv2rgba_X_altivec;
278  break;
279  case AV_PIX_FMT_BGR24:
280  c->yuv2packedX = ff_yuv2bgr24_X_altivec;
281  break;
282  case AV_PIX_FMT_RGB24:
283  c->yuv2packedX = ff_yuv2rgb24_X_altivec;
284  break;
285  }
286  }
287 #endif /* HAVE_ALTIVEC */
288 
290 }
IEEE-754 single precision Y, 32bpp, big-endian.
Definition: pixfmt.h:340
void(* hcScale)(struct SwsContext *c, int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize)
#define vec_f
Definition: util_altivec.h:40
#define AV_CPU_FLAG_ALTIVEC
standard
Definition: cpu.h:60
static int shift(int a, int b)
Definition: sonic.c:82
packed RGB 8:8:8, 24bpp, RGBRGB...
Definition: pixfmt.h:68
static av_always_inline int is16BPS(enum AVPixelFormat pix_fmt)
void(* hyScale)(struct SwsContext *c, int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize)
Scale one horizontal line of input data using a filter over the input lines, to produce one (differen...
Macro definitions for various function/variable attributes.
#define av_cold
Definition: attributes.h:88
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
#define SWS_FULL_CHR_H_INT
Definition: swscale.h:79
packed ABGR 8:8:8:8, 32bpp, ABGRABGR...
Definition: pixfmt.h:94
external API header
enum AVPixelFormat dstFormat
Destination pixel format.
yuv2packedX_fn yuv2packedX
#define src
Definition: vp8dsp.c:254
yuv2plane1_float(yuv2plane1_float_c_template, yuv2plane1_float(float, LE)
Definition: output.c:305
packed BGRA 8:8:8:8, 32bpp, BGRABGRA...
Definition: pixfmt.h:95
static av_always_inline int isSemiPlanarYUV(enum AVPixelFormat pix_fmt)
packed ARGB 8:8:8:8, 32bpp, ARGBARGB...
Definition: pixfmt.h:92
packed RGBA 8:8:8:8, 32bpp, RGBARGBA...
Definition: pixfmt.h:93
av_cold void ff_sws_init_swscale_ppc(SwsContext *c)
yuv2planar1_fn yuv2plane1
int32_t
packed RGB 8:8:8, 24bpp, BGRBGR...
Definition: pixfmt.h:69
#define av_bswap32
Definition: bswap.h:33
#define vec_u32
Definition: util_altivec.h:38
yuv2planarX_fn yuv2planeX
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:93
static av_always_inline uint32_t av_float2int(float f)
Reinterpret a float as a 32-bit integer.
Definition: intfloat.h:50
Contains misc utility macros and inline functions.
#define SWS_BITEXACT
Definition: swscale.h:84
static double clip(void *opaque, double val)
Clip value val in the minval - maxval range.
Definition: vf_lut.c:162
#define output_pixel(pos, val, bias, signedness)
Definition: output.c:888
av_cold void ff_sws_init_swscale_vsx(SwsContext *c)
Definition: swscale_vsx.c:2076
#define vec_u16
Definition: util_altivec.h:36
IEEE-754 single precision Y, 32bpp, little-endian.
Definition: pixfmt.h:341
int flags
Flags passed by the user to select scaler algorithm, optimizations, subsampling, etc...
AVPixelFormat
Pixel format.
Definition: pixfmt.h:64
static double val(void *priv, double ch)
Definition: aeval.c:76
static av_always_inline int isNBPS(enum AVPixelFormat pix_fmt)
int i
Definition: input.c:406