FFmpeg
hscale_fast_bilinear_simd.c
/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "../swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/mem_internal.h"

#define RET 0xC3 // near return opcode for x86
#define PREFETCH "prefetchnta"

#if HAVE_INLINE_ASM
av_cold int ff_init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode,
                                   int16_t *filter, int32_t *filterPos,
                                   int numSplits)
{
    uint8_t *fragmentA;
    x86_reg imm8OfPShufW1A;
    x86_reg imm8OfPShufW2A;
    x86_reg fragmentLengthA;
    uint8_t *fragmentB;
    x86_reg imm8OfPShufW1B;
    x86_reg imm8OfPShufW2B;
    x86_reg fragmentLengthB;
    int fragmentPos;

    int xpos, i;

    // create an optimized horizontal scaling routine
    /* This scaler is made of runtime-generated MMXEXT code using specially tuned
     * pshufw instructions. For every four output pixels, if four input pixels
     * are enough for the fast bilinear scaling, then a chunk of fragmentB is
     * used. If five input pixels are needed, then a chunk of fragmentA is used.
     */

    // code fragment

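    /* Neither asm block below is meant to execute its template body at init
     * time: the leading "jmp 9f" skips straight to label 9, so running the
     * block only evaluates the lea/dec/sub epilogue. That epilogue records
     * where the template starts, where the two pshufw immediate bytes live
     * inside it (each "dec" backs a label address up one byte, onto the imm8
     * that ends the preceding pshufw, and "sub %0" turns it into an offset
     * from the template start), and how long the template is, so the template
     * can later be memcpy'd and patched. */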
    __asm__ volatile (
        "jmp                         9f                 \n\t"
        // Begin
        "0:                                             \n\t"
        "movq    (%%"FF_REG_d", %%"FF_REG_a"), %%mm3    \n\t"
        "movd    (%%"FF_REG_c", %%"FF_REG_S"), %%mm0    \n\t"
        "movd   1(%%"FF_REG_c", %%"FF_REG_S"), %%mm1    \n\t"
        "punpcklbw                %%mm7, %%mm1          \n\t"
        "punpcklbw                %%mm7, %%mm0          \n\t"
        "pshufw                   $0xFF, %%mm1, %%mm1   \n\t"
        "1:                                             \n\t"
        "pshufw                   $0xFF, %%mm0, %%mm0   \n\t"
        "2:                                             \n\t"
        "psubw                    %%mm1, %%mm0          \n\t"
        "movl    8(%%"FF_REG_b", %%"FF_REG_a"), %%esi   \n\t"
        "pmullw                   %%mm3, %%mm0          \n\t"
        "psllw                       $7, %%mm1          \n\t"
        "paddw                    %%mm1, %%mm0          \n\t"

        "movq %%mm0, (%%"FF_REG_D", %%"FF_REG_a")       \n\t"

        "add                         $8, %%"FF_REG_a"   \n\t"
        // End
        "9:                                             \n\t"
        "lea " LOCAL_MANGLE(0b) ", %0                   \n\t"
        "lea " LOCAL_MANGLE(1b) ", %1                   \n\t"
        "lea " LOCAL_MANGLE(2b) ", %2                   \n\t"
        "dec                         %1                 \n\t"
        "dec                         %2                 \n\t"
        "sub                         %0, %1             \n\t"
        "sub                         %0, %2             \n\t"
        "lea " LOCAL_MANGLE(9b) ", %3                   \n\t"
        "sub                         %0, %3             \n\t"


        : "=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
          "=r" (fragmentLengthA)
    );
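    /* On exit from the block above: fragmentA points at the start of the
     * five-input-pixel template, imm8OfPShufW1A/imm8OfPShufW2A are the byte
     * offsets of the two pshufw immediates within it, and fragmentLengthA is
     * its size in bytes. The nearly identical block below captures fragmentB,
     * the four-input-pixel variant: it loads a single dword and derives %mm1
     * from %mm0 via pshufw instead of loading a second dword shifted by one
     * byte. */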

    __asm__ volatile (
        "jmp                         9f                 \n\t"
        // Begin
        "0:                                             \n\t"
        "movq    (%%"FF_REG_d", %%"FF_REG_a"), %%mm3    \n\t"
        "movd    (%%"FF_REG_c", %%"FF_REG_S"), %%mm0    \n\t"
        "punpcklbw                %%mm7, %%mm0          \n\t"
        "pshufw                   $0xFF, %%mm0, %%mm1   \n\t"
        "1:                                             \n\t"
        "pshufw                   $0xFF, %%mm0, %%mm0   \n\t"
        "2:                                             \n\t"
        "psubw                    %%mm1, %%mm0          \n\t"
        "movl    8(%%"FF_REG_b", %%"FF_REG_a"), %%esi   \n\t"
        "pmullw                   %%mm3, %%mm0          \n\t"
        "psllw                       $7, %%mm1          \n\t"
        "paddw                    %%mm1, %%mm0          \n\t"

        "movq %%mm0, (%%"FF_REG_D", %%"FF_REG_a")       \n\t"

        "add                         $8, %%"FF_REG_a"   \n\t"
        // End
        "9:                                             \n\t"
        "lea " LOCAL_MANGLE(0b) ", %0                   \n\t"
        "lea " LOCAL_MANGLE(1b) ", %1                   \n\t"
        "lea " LOCAL_MANGLE(2b) ", %2                   \n\t"
        "dec                         %1                 \n\t"
        "dec                         %2                 \n\t"
        "sub                         %0, %1             \n\t"
        "sub                         %0, %2             \n\t"
        "lea " LOCAL_MANGLE(9b) ", %3                   \n\t"
        "sub                         %0, %3             \n\t"


        : "=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
          "=r" (fragmentLengthB)
    );

    xpos        = 0; // lumXInc/2 - 0x8000; // difference between pixel centers
    fragmentPos = 0;

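    /* Code generation walks the output in groups of four pixels. For each
     * group, a..d are the source-pixel indices (relative to xx) that the four
     * outputs read from; d + 1 < 4 means the whole group fits in four input
     * pixels, so the fragmentB template is used, otherwise fragmentA. The
     * 2-bit indices are packed into the two pshufw immediates, and filter[]
     * receives the 7-bit fractional weights derived from the low 16 bits of
     * xpos. */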
    for (i = 0; i < dstW / numSplits; i++) {
        int xx = xpos >> 16;

        if ((i & 3) == 0) {
            int a                  = 0;
            int b                  = ((xpos + xInc) >> 16) - xx;
            int c                  = ((xpos + xInc * 2) >> 16) - xx;
            int d                  = ((xpos + xInc * 3) >> 16) - xx;
            int inc                = (d + 1 < 4);
            uint8_t *fragment      = inc ? fragmentB : fragmentA;
            x86_reg imm8OfPShufW1  = inc ? imm8OfPShufW1B : imm8OfPShufW1A;
            x86_reg imm8OfPShufW2  = inc ? imm8OfPShufW2B : imm8OfPShufW2A;
            x86_reg fragmentLength = inc ? fragmentLengthB : fragmentLengthA;
            int maxShift           = 3 - (d + inc);
            int shift              = 0;

            if (filterCode) {
                filter[i]        = ((xpos              & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 1]    = (((xpos + xInc)     & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 2]    = (((xpos + xInc * 2) & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 3]    = (((xpos + xInc * 3) & 0xFFFF) ^ 0xFFFF) >> 9;
                filterPos[i / 2] = xx;

                memcpy(filterCode + fragmentPos, fragment, fragmentLength);

                filterCode[fragmentPos + imm8OfPShufW1] = (a + inc)        |
                                                          ((b + inc) << 2) |
                                                          ((c + inc) << 4) |
                                                          ((d + inc) << 6);
                filterCode[fragmentPos + imm8OfPShufW2] = a | (b << 2) |
                                                          (c << 4)     |
                                                          (d << 6);

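                /* Shift trick: adding 0x55 * shift to a pshufw immediate adds
                 * `shift` to each of the four 2-bit selectors at once
                 * (0x55 == 0b01010101; maxShift guarantees no field carries),
                 * compensating for moving filterPos[i / 2] back by `shift`
                 * source pixels. It keeps the last group from reading past
                 * the end of the source line, or realigns filterPos to a
                 * 4-byte boundary. */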
                if (i + 4 - inc >= dstW)
                    shift = maxShift; // avoid overread
                else if ((filterPos[i / 2] & 3) <= maxShift)
                    shift = filterPos[i / 2] & 3; // align

                if (shift && i >= shift) {
                    filterCode[fragmentPos + imm8OfPShufW1] += 0x55 * shift;
                    filterCode[fragmentPos + imm8OfPShufW2] += 0x55 * shift;
                    filterPos[i / 2]                        -= shift;
                }
            }

            fragmentPos += fragmentLength;

            if (filterCode)
                filterCode[fragmentPos] = RET;
        }
        xpos += xInc;
    }
    if (filterCode)
        filterPos[((i / 2) + 1) & (~1)] = xpos >> 16; // needed to jump to the next part

    return fragmentPos + 1;
}
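
/* A sketch of the intended two-pass calling pattern, mirroring how libswscale
 * initializes this scaler. The first call, with filterCode == NULL, only
 * reports how many bytes of code will be generated; the second call fills a
 * buffer that must be executable by the time the functions below run. The
 * mmap()-based allocation shown is an illustrative assumption, not part of
 * this file's API:
 *
 *     int size = ff_init_hscaler_mmxext(dstW, xInc, NULL, NULL, NULL, 8);
 *     uint8_t *code = mmap(NULL, size, PROT_READ | PROT_WRITE | PROT_EXEC,
 *                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); // <sys/mman.h>
 *     ff_init_hscaler_mmxext(dstW, xInc, code, filter, filterPos, 8);
 *     // `code` then becomes c->lumMmxextFilterCode (or chrMmxextFilterCode)
 *     // and is reached through "call *%4" below.
 */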

void ff_hyscale_fast_mmxext(SwsContext *c, int16_t *dst,
                            int dstWidth, const uint8_t *src,
                            int srcW, int xInc)
{
    int32_t *filterPos        = c->hLumFilterPos;
    int16_t *filter           = c->hLumFilter;
    void    *mmxextFilterCode = c->lumMmxextFilterCode;
    int i;
#if ARCH_X86_64
    uint64_t retsave;
#else
#if !HAVE_EBX_AVAILABLE
    uint64_t ebxsave;
#endif
#endif

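    /* The generated filter code is entered with a bare "call *%4" while the
     * stack pointer still sits in the compiler's frame, so on x86-64 the
     * pushed return address lands in the red zone at -8(%rsp), possibly on
     * top of a value the compiler stashed there; that slot is saved to
     * `retsave` first and restored afterwards. On 32-bit builds where EBX is
     * reserved (e.g. as the PIC register) it cannot be listed as clobbered,
     * so it is saved and restored by hand instead. */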
    __asm__ volatile(
#if ARCH_X86_64
        "mov          -8(%%rsp), %%"FF_REG_a"       \n\t"
        "mov       %%"FF_REG_a", %5                 \n\t" // retsave
#else
#if !HAVE_EBX_AVAILABLE
        "mov       %%"FF_REG_b", %5                 \n\t" // ebxsave
#endif
#endif
        "pxor             %%mm7, %%mm7              \n\t"
        "mov                 %0, %%"FF_REG_c"       \n\t"
        "mov                 %1, %%"FF_REG_D"       \n\t"
        "mov                 %2, %%"FF_REG_d"       \n\t"
        "mov                 %3, %%"FF_REG_b"       \n\t"
        "xor       %%"FF_REG_a", %%"FF_REG_a"       \n\t" // i
        PREFETCH"   (%%"FF_REG_c")                  \n\t"
        PREFETCH" 32(%%"FF_REG_c")                  \n\t"
        PREFETCH" 64(%%"FF_REG_c")                  \n\t"

#if ARCH_X86_64
#define CALL_MMXEXT_FILTER_CODE \
    "movl       (%%"FF_REG_b"), %%esi               \n\t"\
    "call                   *%4                     \n\t"\
    "movl (%%"FF_REG_b", %%"FF_REG_a"), %%esi       \n\t"\
    "add          %%"FF_REG_S", %%"FF_REG_c"        \n\t"\
    "add          %%"FF_REG_a", %%"FF_REG_D"        \n\t"\
    "xor          %%"FF_REG_a", %%"FF_REG_a"        \n\t"\

#else
#define CALL_MMXEXT_FILTER_CODE \
    "movl       (%%"FF_REG_b"), %%esi               \n\t"\
    "call                   *%4                     \n\t"\
    "addl (%%"FF_REG_b", %%"FF_REG_a"), %%"FF_REG_c" \n\t"\
    "add          %%"FF_REG_a", %%"FF_REG_D"        \n\t"\
    "xor          %%"FF_REG_a", %%"FF_REG_a"        \n\t"\

#endif /* ARCH_X86_64 */
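
    /* Each CALL_MMXEXT_FILTER_CODE expansion re-runs the same generated
     * fragment chain over the same filter/filterPos tables, covering
     * dstW / numSplits output pixels per run: %esi is primed with
     * filterPos[0], the chain executes up to its RET, then only the source
     * pointer advances (by the chunk-boundary filterPos entry) and the
     * destination pointer moves past the bytes just written before the
     * counter in %FF_REG_a is reset. The eight expansions below pair with a
     * numSplits of 8 when the luma code was generated. */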

    CALL_MMXEXT_FILTER_CODE
    CALL_MMXEXT_FILTER_CODE
    CALL_MMXEXT_FILTER_CODE
    CALL_MMXEXT_FILTER_CODE
    CALL_MMXEXT_FILTER_CODE
    CALL_MMXEXT_FILTER_CODE
    CALL_MMXEXT_FILTER_CODE
    CALL_MMXEXT_FILTER_CODE

#if ARCH_X86_64
        "mov                 %5, %%"FF_REG_a"       \n\t"
        "mov       %%"FF_REG_a", -8(%%rsp)          \n\t"
#else
#if !HAVE_EBX_AVAILABLE
        "mov                 %5, %%"FF_REG_b"       \n\t"
#endif
#endif
        :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode)
#if ARCH_X86_64
          ,"m" (retsave)
#else
#if !HAVE_EBX_AVAILABLE
          ,"m" (ebxsave)
#endif
#endif
        : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_D
#if ARCH_X86_64 || HAVE_EBX_AVAILABLE
         ,"%"FF_REG_b
#endif
    );

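    /* The generated code deliberately stops short of the end of the source
     * line to avoid overreads, so any output pixel whose source position maps
     * to the last input pixel or beyond is filled here with src[srcW - 1]
     * scaled to the filter's 7-bit range (* 128). */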
    for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--)
        dst[i] = src[srcW - 1] * 128;
}

void ff_hcscale_fast_mmxext(SwsContext *c, int16_t *dst1, int16_t *dst2,
                            int dstWidth, const uint8_t *src1,
                            const uint8_t *src2, int srcW, int xInc)
{
    int32_t *filterPos        = c->hChrFilterPos;
    int16_t *filter           = c->hChrFilter;
    void    *mmxextFilterCode = c->chrMmxextFilterCode;
    int i;
#if ARCH_X86_64
    DECLARE_ALIGNED(8, uint64_t, retsave);
#else
#if !HAVE_EBX_AVAILABLE
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
#endif
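    /* Same register-save dance as in ff_hyscale_fast_mmxext() above; the
     * chroma version runs four chunks over src1/dst1, re-arms the source and
     * destination registers, then runs four more over src2/dst2, pairing with
     * a numSplits of 4 when the chroma code was generated. */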
    __asm__ volatile(
#if ARCH_X86_64
        "mov          -8(%%rsp), %%"FF_REG_a"       \n\t"
        "mov       %%"FF_REG_a", %7                 \n\t" // retsave
#else
#if !HAVE_EBX_AVAILABLE
        "mov       %%"FF_REG_b", %7                 \n\t" // ebxsave
#endif
#endif
        "pxor             %%mm7, %%mm7              \n\t"
        "mov                 %0, %%"FF_REG_c"       \n\t"
        "mov                 %1, %%"FF_REG_D"       \n\t"
        "mov                 %2, %%"FF_REG_d"       \n\t"
        "mov                 %3, %%"FF_REG_b"       \n\t"
        "xor       %%"FF_REG_a", %%"FF_REG_a"       \n\t" // i
        PREFETCH"   (%%"FF_REG_c")                  \n\t"
        PREFETCH" 32(%%"FF_REG_c")                  \n\t"
        PREFETCH" 64(%%"FF_REG_c")                  \n\t"

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        "xor       %%"FF_REG_a", %%"FF_REG_a"       \n\t" // i
        "mov                 %5, %%"FF_REG_c"       \n\t" // src2
        "mov                 %6, %%"FF_REG_D"       \n\t" // dst2
        PREFETCH"   (%%"FF_REG_c")                  \n\t"
        PREFETCH" 32(%%"FF_REG_c")                  \n\t"
        PREFETCH" 64(%%"FF_REG_c")                  \n\t"

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE

#if ARCH_X86_64
        "mov                 %7, %%"FF_REG_a"       \n\t"
        "mov       %%"FF_REG_a", -8(%%rsp)          \n\t"
#else
#if !HAVE_EBX_AVAILABLE
        "mov                 %7, %%"FF_REG_b"       \n\t"
#endif
#endif
        :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode), "m" (src2), "m" (dst2)
#if ARCH_X86_64
          ,"m" (retsave)
#else
#if !HAVE_EBX_AVAILABLE
          ,"m" (ebxsave)
#endif
#endif
        : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_D
#if ARCH_X86_64 || HAVE_EBX_AVAILABLE
         ,"%"FF_REG_b
#endif
    );

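    /* Same trailing edge fill as in the luma path, applied to both chroma
     * planes. */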
    for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--) {
        dst1[i] = src1[srcW - 1] * 128;
        dst2[i] = src2[srcW - 1] * 128;
    }
}
#endif //HAVE_INLINE_ASM