FFmpeg
swscale_unscaled.c
Go to the documentation of this file.
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
18 
#include "config.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/aarch64/cpu.h"
23 
/*
 * Chroma conversion coefficients read from the context pointer `c` that must
 * be in scope at the point of use, listed in the order v2r, u2g, v2g, u2b.
 * The trailing comma is kept so the expansion can be dropped straight into a
 * brace initializer.
 */
#define YUV_TO_RGB_TABLE \
    c->yuv2rgb_v2r_coeff, \
    c->yuv2rgb_u2g_coeff, \
    c->yuv2rgb_v2g_coeff, \
    c->yuv2rgb_u2b_coeff,
29 
/*
 * For a three-plane (Y, U, V) input format `ifmt` and a single-plane output
 * format `ofmt`:
 *   - declares the external kernel ff_<ifmt>_to_<ofmt>_neon();
 *   - defines the static wrapper <ifmt>_to_<ofmt>_neon_wrapper(), whose
 *     parameter list is the one used for SwsFunc callbacks.
 *
 * The wrapper gathers the coefficients named by YUV_TO_RGB_TABLE into a local
 * int16_t array, advances dst[0] by srcSliceY rows so the kernel writes at the
 * slice's position in the destination, and passes yuv2rgb_y_offset shifted
 * right by 6 together with yuv2rgb_y_coeff. The kernel's return value is
 * returned unchanged.
 */
#define DECLARE_FF_YUVX_TO_RGBX_FUNCS(ifmt, ofmt) \
int ff_##ifmt##_to_##ofmt##_neon(int w, int h, \
                                 uint8_t *dst, int linesize, \
                                 const uint8_t *srcY, int linesizeY, \
                                 const uint8_t *srcU, int linesizeU, \
                                 const uint8_t *srcV, int linesizeV, \
                                 const int16_t *table, \
                                 int y_offset, \
                                 int y_coeff); \
 \
static int ifmt##_to_##ofmt##_neon_wrapper(SwsInternal *c, const uint8_t *const src[], \
                                           const int srcStride[], int srcSliceY, \
                                           int srcSliceH, uint8_t *const dst[], \
                                           const int dstStride[]) { \
    const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE }; \
 \
    return ff_##ifmt##_to_##ofmt##_neon(c->opts.src_w, srcSliceH, \
                                        dst[0] + srcSliceY * dstStride[0], dstStride[0], \
                                        src[0], srcStride[0], \
                                        src[1], srcStride[1], \
                                        src[2], srcStride[2], \
                                        yuv2rgb_table, \
                                        c->yuv2rgb_y_offset >> 6, \
                                        c->yuv2rgb_y_coeff); \
}
55 
/*
 * Variant of the packed-output declaration macro for a three-plane output:
 *   - declares the external kernel ff_<ifmt>_to_<ofmt>_neon(), which takes two
 *     extra destination pointer/stride pairs (dst1, dst2) after y_coeff;
 *   - defines the static wrapper <ifmt>_to_<ofmt>_neon_wrapper().
 *
 * The wrapper reads three source planes (src[0..2]) and hands the kernel all
 * three destination planes, each advanced by srcSliceY rows of its own
 * stride. Coefficients come from YUV_TO_RGB_TABLE; yuv2rgb_y_offset is passed
 * shifted right by 6. The kernel's return value is returned unchanged.
 */
#define DECLARE_FF_YUVX_TO_GBRP_FUNCS(ifmt, ofmt) \
int ff_##ifmt##_to_##ofmt##_neon(int w, int h, \
                                 uint8_t *dst, int linesize, \
                                 const uint8_t *srcY, int linesizeY, \
                                 const uint8_t *srcU, int linesizeU, \
                                 const uint8_t *srcV, int linesizeV, \
                                 const int16_t *table, \
                                 int y_offset, \
                                 int y_coeff, \
                                 uint8_t *dst1, int linesize1, \
                                 uint8_t *dst2, int linesize2); \
 \
static int ifmt##_to_##ofmt##_neon_wrapper(SwsInternal *c, const uint8_t *const src[], \
                                           const int srcStride[], int srcSliceY, \
                                           int srcSliceH, uint8_t *const dst[], \
                                           const int dstStride[]) { \
    const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE }; \
 \
    return ff_##ifmt##_to_##ofmt##_neon(c->opts.src_w, srcSliceH, \
                                        dst[0] + srcSliceY * dstStride[0], dstStride[0], \
                                        src[0], srcStride[0], \
                                        src[1], srcStride[1], \
                                        src[2], srcStride[2], \
                                        yuv2rgb_table, \
                                        c->yuv2rgb_y_offset >> 6, \
                                        c->yuv2rgb_y_coeff, \
                                        dst[1] + srcSliceY * dstStride[1], dstStride[1], \
                                        dst[2] + srcSliceY * dstStride[2], dstStride[2]); \
}
85 
/*
 * Instantiates, for one three-plane input format `yuvx`, the kernel
 * declaration and wrapper for every output handled here: argb, rgba, abgr,
 * bgra, rgb24 and bgr24 through the single-plane macro, and gbrp through the
 * three-plane macro.
 */
#define DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuvx) \
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, argb) \
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, rgba) \
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, abgr) \
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgra) \
DECLARE_FF_YUVX_TO_GBRP_FUNCS(yuvx, gbrp) \
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, rgb24) \
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgr24)
94 
97 
/*
 * For output format `ofmt`:
 *   - declares the external kernel ff_yuva420p_to_<ofmt>_neon(), which takes a
 *     fourth source plane (srcA, linesizeA) after y_coeff;
 *   - defines the static wrapper yuva420p_to_<ofmt>_neon_wrapper().
 *
 * The wrapper forwards src[0..2] as Y, U and V and src[3]/srcStride[3] as the
 * srcA plane, offsets dst[0] by srcSliceY rows, and passes the
 * YUV_TO_RGB_TABLE coefficients, yuv2rgb_y_offset shifted right by 6, and
 * yuv2rgb_y_coeff. The kernel's return value is returned unchanged.
 */
#define DECLARE_FF_YUVA420P_TO_RGBX_FUNCS(ofmt) \
int ff_yuva420p_to_##ofmt##_neon(int w, int h, \
                                 uint8_t *dst, int linesize, \
                                 const uint8_t *srcY, int linesizeY, \
                                 const uint8_t *srcU, int linesizeU, \
                                 const uint8_t *srcV, int linesizeV, \
                                 const int16_t *table, \
                                 int y_offset, int y_coeff, \
                                 const uint8_t *srcA, int linesizeA); \
 \
static int yuva420p_to_##ofmt##_neon_wrapper(SwsInternal *c, \
                                             const uint8_t *const src[], \
                                             const int srcStride[], int srcSliceY, \
                                             int srcSliceH, uint8_t *const dst[], \
                                             const int dstStride[]) { \
    const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE }; \
 \
    return ff_yuva420p_to_##ofmt##_neon(c->opts.src_w, srcSliceH, \
                                        dst[0] + srcSliceY * dstStride[0], dstStride[0], \
                                        src[0], srcStride[0], \
                                        src[1], srcStride[1], \
                                        src[2], srcStride[2], \
                                        yuv2rgb_table, \
                                        c->yuv2rgb_y_offset >> 6, \
                                        c->yuv2rgb_y_coeff, \
                                        src[3], srcStride[3]); \
}
125 
130 
/*
 * For a two-plane input format `ifmt` (one Y plane, one plane passed as srcC)
 * and a single-plane output format `ofmt`:
 *   - declares the external kernel ff_<ifmt>_to_<ofmt>_neon();
 *   - defines the static wrapper <ifmt>_to_<ofmt>_neon_wrapper().
 *
 * The wrapper forwards src[0] and src[1] with their strides, offsets dst[0]
 * by srcSliceY rows, and passes the YUV_TO_RGB_TABLE coefficients,
 * yuv2rgb_y_offset shifted right by 6, and yuv2rgb_y_coeff. The kernel's
 * return value is returned unchanged.
 */
#define DECLARE_FF_NVX_TO_RGBX_FUNCS(ifmt, ofmt) \
int ff_##ifmt##_to_##ofmt##_neon(int w, int h, \
                                 uint8_t *dst, int linesize, \
                                 const uint8_t *srcY, int linesizeY, \
                                 const uint8_t *srcC, int linesizeC, \
                                 const int16_t *table, \
                                 int y_offset, \
                                 int y_coeff); \
 \
static int ifmt##_to_##ofmt##_neon_wrapper(SwsInternal *c, const uint8_t *const src[], \
                                           const int srcStride[], int srcSliceY, \
                                           int srcSliceH, uint8_t *const dst[], \
                                           const int dstStride[]) { \
    const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE }; \
 \
    return ff_##ifmt##_to_##ofmt##_neon(c->opts.src_w, srcSliceH, \
                                        dst[0] + srcSliceY * dstStride[0], dstStride[0], \
                                        src[0], srcStride[0], src[1], srcStride[1], \
                                        yuv2rgb_table, \
                                        c->yuv2rgb_y_offset >> 6, \
                                        c->yuv2rgb_y_coeff); \
}
153 
/*
 * Two-plane-input counterpart of the three-plane-output declaration macro:
 *   - declares the external kernel ff_<ifmt>_to_<ofmt>_neon(), which takes two
 *     extra destination pointer/stride pairs (dst1, dst2) after y_coeff;
 *   - defines the static wrapper <ifmt>_to_<ofmt>_neon_wrapper().
 *
 * The wrapper forwards src[0] and src[1] and hands the kernel dst[0], dst[1]
 * and dst[2], each advanced by srcSliceY rows of its own stride. Coefficients
 * come from YUV_TO_RGB_TABLE; yuv2rgb_y_offset is passed shifted right by 6.
 * The kernel's return value is returned unchanged.
 */
#define DECLARE_FF_NVX_TO_GBRP_FUNCS(ifmt, ofmt) \
int ff_##ifmt##_to_##ofmt##_neon(int w, int h, \
                                 uint8_t *dst, int linesize, \
                                 const uint8_t *srcY, int linesizeY, \
                                 const uint8_t *srcC, int linesizeC, \
                                 const int16_t *table, \
                                 int y_offset, \
                                 int y_coeff, \
                                 uint8_t *dst1, int linesize1, \
                                 uint8_t *dst2, int linesize2); \
 \
static int ifmt##_to_##ofmt##_neon_wrapper(SwsInternal *c, const uint8_t *const src[], \
                                           const int srcStride[], int srcSliceY, \
                                           int srcSliceH, uint8_t *const dst[], \
                                           const int dstStride[]) { \
    const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE }; \
 \
    return ff_##ifmt##_to_##ofmt##_neon(c->opts.src_w, srcSliceH, \
                                        dst[0] + srcSliceY * dstStride[0], dstStride[0], \
                                        src[0], srcStride[0], src[1], srcStride[1], \
                                        yuv2rgb_table, \
                                        c->yuv2rgb_y_offset >> 6, \
                                        c->yuv2rgb_y_coeff, \
                                        dst[1] + srcSliceY * dstStride[1], dstStride[1], \
                                        dst[2] + srcSliceY * dstStride[2], dstStride[2]); \
}
180 
181 void ff_nv24_to_yuv420p_chroma_neon(uint8_t *dst1, int dstStride1,
182  uint8_t *dst2, int dstStride2,
183  const uint8_t *src, int srcStride,
184  int w, int h);
185 
186 static int nv24_to_yuv420p_neon_wrapper(SwsInternal *c, const uint8_t *const src[],
187  const int srcStride[], int srcSliceY, int srcSliceH,
188  uint8_t *const dst[], const int dstStride[])
189 {
190  uint8_t *dst1 = dst[1] + dstStride[1] * srcSliceY / 2;
191  uint8_t *dst2 = dst[2] + dstStride[2] * srcSliceY / 2;
192 
193  ff_copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->opts.src_w,
194  dst[0], dstStride[0]);
195 
196  if (c->opts.src_format == AV_PIX_FMT_NV24)
197  ff_nv24_to_yuv420p_chroma_neon(dst1, dstStride[1], dst2, dstStride[2],
198  src[1], srcStride[1], c->opts.src_w / 2,
199  srcSliceH);
200  else
201  ff_nv24_to_yuv420p_chroma_neon(dst2, dstStride[2], dst1, dstStride[1],
202  src[1], srcStride[1], c->opts.src_w / 2,
203  srcSliceH);
204 
205  return srcSliceH;
206 }
207 
/*
 * Instantiates, for one two-plane input format `nvx`, the kernel declaration
 * and wrapper for every output handled here: argb, rgba, abgr, bgra, rgb24
 * and bgr24 through the single-plane macro, and gbrp through the three-plane
 * macro.
 */
#define DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nvx) \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, argb) \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgba) \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, abgr) \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgra) \
DECLARE_FF_NVX_TO_GBRP_FUNCS(nvx, gbrp) \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgb24) \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgr24)
216 
219 
/*
 * Installs <ifmt>_to_<ofmt>_neon_wrapper as c->convert_unscaled when the
 * context `c` (taken from the enclosing scope) converts AV_PIX_FMT_<IFMT> to
 * AV_PIX_FMT_<OFMT>, the source height is even, the source width is a
 * multiple of 16, and accurate_rnd is zero. Otherwise `c` is left untouched.
 *
 * We need a 16 pixel width alignment. This constraint can easily be removed
 * for input reading, but for the output, which is 4 bytes per pixel (RGBA),
 * the assembly might write as much as 4*15=60 extra bytes at the end of the
 * line, which won't fit the 32-byte buffer alignment.
 */
#define SET_FF_NVX_TO_RGBX_FUNC(ifmt, IFMT, ofmt, OFMT, accurate_rnd) do { \
    if (c->opts.src_format == AV_PIX_FMT_##IFMT \
        && c->opts.dst_format == AV_PIX_FMT_##OFMT \
        && !(c->opts.src_h & 1) \
        && !(c->opts.src_w & 15) \
        && !(accurate_rnd)) \
        c->convert_unscaled = ifmt##_to_##ofmt##_neon_wrapper; \
} while (0)
232 
/*
 * Runs SET_FF_NVX_TO_RGBX_FUNC for input format `nvx` / AV_PIX_FMT_<NVX>
 * against each of the seven output formats argb, rgba, abgr, bgra, gbrp,
 * rgb24 and bgr24, forwarding accurate_rnd unchanged.
 */
#define SET_FF_NVX_TO_ALL_RGBX_FUNC(nvx, NVX, accurate_rnd) do { \
    SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, argb, ARGB, accurate_rnd); \
    SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, rgba, RGBA, accurate_rnd); \
    SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, abgr, ABGR, accurate_rnd); \
    SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, bgra, BGRA, accurate_rnd); \
    SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, gbrp, GBRP, accurate_rnd); \
    SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, rgb24, RGB24, accurate_rnd); \
    SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, bgr24, BGR24, accurate_rnd); \
} while (0)
242 
244  int accurate_rnd = c->opts.flags & SWS_ACCURATE_RND;
245 
246  SET_FF_NVX_TO_ALL_RGBX_FUNC(nv12, NV12, accurate_rnd);
247  SET_FF_NVX_TO_ALL_RGBX_FUNC(nv21, NV21, accurate_rnd);
248  SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv420p, YUV420P, accurate_rnd);
249  SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv422p, YUV422P, accurate_rnd);
250  SET_FF_NVX_TO_RGBX_FUNC(yuva420p, YUVA420P, argb, ARGB, accurate_rnd);
251  SET_FF_NVX_TO_RGBX_FUNC(yuva420p, YUVA420P, rgba, RGBA, accurate_rnd);
252  SET_FF_NVX_TO_RGBX_FUNC(yuva420p, YUVA420P, abgr, ABGR, accurate_rnd);
253  SET_FF_NVX_TO_RGBX_FUNC(yuva420p, YUVA420P, bgra, BGRA, accurate_rnd);
254  SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, rgb24, RGB24, accurate_rnd);
255  SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, bgr24, BGR24, accurate_rnd);
256  SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, gbrp, GBRP, accurate_rnd);
257 
258  if (c->opts.dst_format == AV_PIX_FMT_YUV420P &&
259  (c->opts.src_format == AV_PIX_FMT_NV24 || c->opts.src_format == AV_PIX_FMT_NV42) &&
260  !(c->opts.src_h & 1) && !(c->opts.src_w & 15) && !accurate_rnd)
261  c->convert_unscaled = nv24_to_yuv420p_neon_wrapper;
262 }
263 
265 {
266  int cpu_flags = av_get_cpu_flags();
267  if (have_neon(cpu_flags))
269 }
270 
272 {
273  int cpu_flags = av_get_cpu_flags();
274  if (!have_neon(cpu_flags) ||
275  (c->opts.src_h & 1) || (c->opts.src_w & 15) ||
276  (c->opts.flags & SWS_ACCURATE_RND))
277  return NULL;
278 
279  if (c->opts.src_format == AV_PIX_FMT_YUV420P) {
280  switch (c->opts.dst_format) {
281  case AV_PIX_FMT_ARGB: return yuv420p_to_argb_neon_wrapper;
282  case AV_PIX_FMT_RGBA: return yuv420p_to_rgba_neon_wrapper;
283  case AV_PIX_FMT_ABGR: return yuv420p_to_abgr_neon_wrapper;
284  case AV_PIX_FMT_BGRA: return yuv420p_to_bgra_neon_wrapper;
285  case AV_PIX_FMT_RGB24: return yuv420p_to_rgb24_neon_wrapper;
286  case AV_PIX_FMT_BGR24: return yuv420p_to_bgr24_neon_wrapper;
287  case AV_PIX_FMT_GBRP: return yuv420p_to_gbrp_neon_wrapper;
288  }
289  } else if (c->opts.src_format == AV_PIX_FMT_YUVA420P) {
290  switch (c->opts.dst_format) {
291 #if CONFIG_SWSCALE_ALPHA
292  case AV_PIX_FMT_ARGB: return yuva420p_to_argb_neon_wrapper;
293  case AV_PIX_FMT_RGBA: return yuva420p_to_rgba_neon_wrapper;
294  case AV_PIX_FMT_ABGR: return yuva420p_to_abgr_neon_wrapper;
295  case AV_PIX_FMT_BGRA: return yuva420p_to_bgra_neon_wrapper;
296 #endif
297  case AV_PIX_FMT_RGB24: return yuv420p_to_rgb24_neon_wrapper;
298  case AV_PIX_FMT_BGR24: return yuv420p_to_bgr24_neon_wrapper;
299  case AV_PIX_FMT_GBRP: return yuv420p_to_gbrp_neon_wrapper;
300  }
301  } else if (c->opts.src_format == AV_PIX_FMT_YUV422P) {
302  switch (c->opts.dst_format) {
303  case AV_PIX_FMT_ARGB: return yuv422p_to_argb_neon_wrapper;
304  case AV_PIX_FMT_RGBA: return yuv422p_to_rgba_neon_wrapper;
305  case AV_PIX_FMT_ABGR: return yuv422p_to_abgr_neon_wrapper;
306  case AV_PIX_FMT_BGRA: return yuv422p_to_bgra_neon_wrapper;
307  case AV_PIX_FMT_RGB24: return yuv422p_to_rgb24_neon_wrapper;
308  case AV_PIX_FMT_BGR24: return yuv422p_to_bgr24_neon_wrapper;
309  case AV_PIX_FMT_GBRP: return yuv422p_to_gbrp_neon_wrapper;
310  }
311  }
312  return NULL;
313 }
get_unscaled_swscale_neon
static void get_unscaled_swscale_neon(SwsInternal *c)
Definition: swscale_unscaled.c:243
SET_FF_NVX_TO_ALL_RGBX_FUNC
#define SET_FF_NVX_TO_ALL_RGBX_FUNC(nvx, NVX, accurate_rnd)
Definition: swscale_unscaled.c:233
AV_PIX_FMT_BGR24
@ AV_PIX_FMT_BGR24
packed RGB 8:8:8, 24bpp, BGRBGR...
Definition: pixfmt.h:76
AV_PIX_FMT_BGRA
@ AV_PIX_FMT_BGRA
packed BGRA 8:8:8:8, 32bpp, BGRABGRA...
Definition: pixfmt.h:102
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:109
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:56
av_cold
#define av_cold
Definition: attributes.h:106
DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS
#define DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuvx)
Definition: swscale_unscaled.c:86
AV_PIX_FMT_YUVA420P
@ AV_PIX_FMT_YUVA420P
planar YUV 4:2:0, 20bpp, (1 Cr & Cb sample per 2x2 Y & A samples)
Definition: pixfmt.h:108
AV_PIX_FMT_YUV420P
@ AV_PIX_FMT_YUV420P
planar YUV 4:2:0, 12bpp, (1 Cr & Cb sample per 2x2 Y samples)
Definition: pixfmt.h:73
AV_PIX_FMT_RGBA
@ AV_PIX_FMT_RGBA
packed RGBA 8:8:8:8, 32bpp, RGBARGBA...
Definition: pixfmt.h:100
NULL
#define NULL
Definition: coverity.c:32
ff_get_unscaled_swscale_aarch64
void ff_get_unscaled_swscale_aarch64(SwsInternal *c)
Definition: swscale_unscaled.c:264
AV_PIX_FMT_ABGR
@ AV_PIX_FMT_ABGR
packed ABGR 8:8:8:8, 32bpp, ABGRABGR...
Definition: pixfmt.h:101
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
AV_PIX_FMT_RGB24
@ AV_PIX_FMT_RGB24
packed RGB 8:8:8, 24bpp, RGBRGB...
Definition: pixfmt.h:75
dst
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst
Definition: dsp.h:87
have_neon
#define have_neon(flags)
Definition: cpu.h:26
DECLARE_FF_YUVA420P_TO_RGBX_FUNCS
#define DECLARE_FF_YUVA420P_TO_RGBX_FUNCS(ofmt)
Definition: swscale_unscaled.c:98
AV_PIX_FMT_ARGB
@ AV_PIX_FMT_ARGB
packed ARGB 8:8:8:8, 32bpp, ARGBARGB...
Definition: pixfmt.h:99
ff_yuv2rgb_init_aarch64
av_cold SwsFunc ff_yuv2rgb_init_aarch64(SwsInternal *c)
Definition: swscale_unscaled.c:271
ff_copyPlane
void ff_copyPlane(const uint8_t *src, int srcStride, int srcSliceY, int srcSliceH, int width, uint8_t *dst, int dstStride)
Definition: swscale_unscaled.c:125
AV_PIX_FMT_NV24
@ AV_PIX_FMT_NV24
planar YUV 4:4:4, 24bpp, 1 plane for Y and 1 plane for the UV components, which are interleaved (firs...
Definition: pixfmt.h:371
swscale_internal.h
SET_FF_NVX_TO_RGBX_FUNC
#define SET_FF_NVX_TO_RGBX_FUNC(ifmt, IFMT, ofmt, OFMT, accurate_rnd)
Definition: swscale_unscaled.c:224
ff_nv24_to_yuv420p_chroma_neon
void ff_nv24_to_yuv420p_chroma_neon(uint8_t *dst1, int dstStride1, uint8_t *dst2, int dstStride2, const uint8_t *src, int srcStride, int w, int h)
AV_PIX_FMT_NV42
@ AV_PIX_FMT_NV42
as above, but U and V bytes are swapped
Definition: pixfmt.h:372
SwsInternal
Definition: swscale_internal.h:334
DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS
#define DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nvx)
Definition: swscale_unscaled.c:208
AV_PIX_FMT_GBRP
@ AV_PIX_FMT_GBRP
planar GBR 4:4:4 24bpp
Definition: pixfmt.h:165
AV_PIX_FMT_YUV422P
@ AV_PIX_FMT_YUV422P
planar YUV 4:2:2, 16bpp, (1 Cr & Cb sample per 2x1 Y samples)
Definition: pixfmt.h:77
w
uint8_t w
Definition: llvidencdsp.c:39
SwsFunc
int(* SwsFunc)(SwsInternal *c, const uint8_t *const src[], const int srcStride[], int srcSliceY, int srcSliceH, uint8_t *const dst[], const int dstStride[])
Definition: swscale_internal.h:96
RGBA
#define RGBA(r, g, b, a)
Definition: dvbsubdec.c:42
nv24_to_yuv420p_neon_wrapper
static int nv24_to_yuv420p_neon_wrapper(SwsInternal *c, const uint8_t *const src[], const int srcStride[], int srcSliceY, int srcSliceH, uint8_t *const dst[], const int dstStride[])
Definition: swscale_unscaled.c:186
SWS_ACCURATE_RND
@ SWS_ACCURATE_RND
Force bit-exact output.
Definition: swscale.h:157
h
h
Definition: vp9dsp_template.c:2070
cpu.h
src
#define src
Definition: vp8dsp.c:248
swscale.h