FFmpeg
hscale_fast_bilinear_simd.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "../swscale_internal.h"
22 #include "libavutil/x86/asm.h"
23 #include "libavutil/x86/cpu.h"
24 
// Opcode byte written after the generated scaler code (see ff_init_hscaler_mmxext)
// so that a `call` into that code returns to the caller.
25 #define RET 0xC3 // near return opcode for x86
// Prefetch mnemonic pasted in front of the operands in the inline asm of the
// scaling functions below.
26 #define PREFETCH "prefetchnta"
27 
28 #if HAVE_INLINE_ASM
/**
 * Build the runtime-generated MMXEXT code and the tables used by the fast
 * bilinear horizontal scalers.
 *
 * Two machine-code templates ("fragments") are embedded in this function via
 * inline asm. For every group of four output pixels one of them is copied to
 * filterCode and the immediate bytes of its two pshufw instructions are
 * patched so that they select the source pixels needed by that group.
 *
 * @param dstW       destination width; dstW / numSplits output pixels are
 *                   processed
 * @param xInc       horizontal step per output pixel in 16.16 fixed point
 *                   (the source position is xpos >> 16, the fraction is
 *                   xpos & 0xFFFF)
 * @param filterCode buffer receiving the generated code, or NULL to only
 *                   compute the required size (nothing is written then)
 * @param filter     receives one 7-bit coefficient per output pixel
 * @param filterPos  receives the source start position of each group of four
 *                   pixels, stored at index i / 2
 * @param numSplits  divisor applied to dstW
 *                   (presumably the number of separate calls made into the
 *                   generated code per line -- confirm against the callers)
 * @return number of code bytes needed: all copied fragments plus one byte for
 *         the terminating RET
 */
29 av_cold int ff_init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode,
30  int16_t *filter, int32_t *filterPos,
31  int numSplits)
32 {
 // Fragment A: template address, byte offsets of the imm8 of its two pshufw
 // instructions, and template length (all produced by the first asm below).
33  uint8_t *fragmentA;
34  x86_reg imm8OfPShufW1A;
35  x86_reg imm8OfPShufW2A;
36  x86_reg fragmentLengthA;
 // The same four values for fragment B (second asm below).
37  uint8_t *fragmentB;
38  x86_reg imm8OfPShufW1B;
39  x86_reg imm8OfPShufW2B;
40  x86_reg fragmentLengthB;
 // Write offset into filterCode == number of code bytes emitted so far.
41  int fragmentPos;
42 
43  int xpos, i;
44 
45  // create an optimized horizontal scaling routine
46  /* This scaler is made of runtime-generated MMXEXT code using specially tuned
47  * pshufw instructions. For every four output pixels, if four input pixels
48  * are enough for the fast bilinear scaling, then a chunk of fragmentB is
49  * used. If five input pixels are needed, then a chunk of fragmentA is used.
50  */
51 
52  // code fragment
53 
 // Fragment A template. The leading jmp skips the template, so it is never
 // executed in place; the code after label 9 only records where it lives.
 // Register roles inside the template follow the set-up done by the callers
 // in this file: c = source, S = source offset, d = filter, b = filterPos,
 // D = destination, a = running byte offset, mm7 = 0.
54  __asm__ volatile (
55  "jmp 9f \n\t"
56  // Begin
57  "0: \n\t"
58  "movq (%%"FF_REG_d", %%"FF_REG_a"), %%mm3 \n\t"
 // Two loads one byte apart: mm0 from offset 0, mm1 from offset 1.
59  "movd (%%"FF_REG_c", %%"FF_REG_S"), %%mm0 \n\t"
60  "movd 1(%%"FF_REG_c", %%"FF_REG_S"), %%mm1 \n\t"
61  "punpcklbw %%mm7, %%mm1 \n\t"
62  "punpcklbw %%mm7, %%mm0 \n\t"
 // The $0xFF immediates are placeholders; they are overwritten per group below.
63  "pshufw $0xFF, %%mm1, %%mm1 \n\t"
64  "1: \n\t"
65  "pshufw $0xFF, %%mm0, %%mm0 \n\t"
66  "2: \n\t"
67  "psubw %%mm1, %%mm0 \n\t"
 // Fetch the source offset for the next group from filterPos.
68  "movl 8(%%"FF_REG_b", %%"FF_REG_a"), %%esi \n\t"
 // result = (mm0 - mm1) * coefficient + (mm1 << 7)
69  "pmullw %%mm3, %%mm0 \n\t"
70  "psllw $7, %%mm1 \n\t"
71  "paddw %%mm1, %%mm0 \n\t"
72 
73  "movq %%mm0, (%%"FF_REG_D", %%"FF_REG_a") \n\t"
74 
75  "add $8, %%"FF_REG_a" \n\t"
76  // End
77  "9: \n\t"
 // %0 = start of the template.
 // %1, %2 = (address of label 1 / 2) - 1 - start: offset of the last byte of
 //          the pshufw preceding each label, i.e. its imm8.
 // %3 = address of label 9 - start: template length in bytes.
78  "lea " LOCAL_MANGLE(0b) ", %0 \n\t"
79  "lea " LOCAL_MANGLE(1b) ", %1 \n\t"
80  "lea " LOCAL_MANGLE(2b) ", %2 \n\t"
81  "dec %1 \n\t"
82  "dec %2 \n\t"
83  "sub %0, %1 \n\t"
84  "sub %0, %2 \n\t"
85  "lea " LOCAL_MANGLE(9b) ", %3 \n\t"
86  "sub %0, %3 \n\t"
87 
88 
89  : "=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
90  "=r" (fragmentLengthA)
91  );
92 
 // Fragment B template: identical to A except that there is a single movd
 // load and mm1 is produced from mm0 by the first pshufw instead of coming
 // from a second load at offset 1.
93  __asm__ volatile (
94  "jmp 9f \n\t"
95  // Begin
96  "0: \n\t"
97  "movq (%%"FF_REG_d", %%"FF_REG_a"), %%mm3 \n\t"
98  "movd (%%"FF_REG_c", %%"FF_REG_S"), %%mm0 \n\t"
99  "punpcklbw %%mm7, %%mm0 \n\t"
100  "pshufw $0xFF, %%mm0, %%mm1 \n\t"
101  "1: \n\t"
102  "pshufw $0xFF, %%mm0, %%mm0 \n\t"
103  "2: \n\t"
104  "psubw %%mm1, %%mm0 \n\t"
105  "movl 8(%%"FF_REG_b", %%"FF_REG_a"), %%esi \n\t"
106  "pmullw %%mm3, %%mm0 \n\t"
107  "psllw $7, %%mm1 \n\t"
108  "paddw %%mm1, %%mm0 \n\t"
109 
110  "movq %%mm0, (%%"FF_REG_D", %%"FF_REG_a") \n\t"
111 
112  "add $8, %%"FF_REG_a" \n\t"
113  // End
114  "9: \n\t"
 // Same address/offset/length computation as for fragment A.
115  "lea " LOCAL_MANGLE(0b) ", %0 \n\t"
116  "lea " LOCAL_MANGLE(1b) ", %1 \n\t"
117  "lea " LOCAL_MANGLE(2b) ", %2 \n\t"
118  "dec %1 \n\t"
119  "dec %2 \n\t"
120  "sub %0, %1 \n\t"
121  "sub %0, %2 \n\t"
122  "lea " LOCAL_MANGLE(9b) ", %3 \n\t"
123  "sub %0, %3 \n\t"
124 
125 
126  : "=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
127  "=r" (fragmentLengthB)
128  );
129 
130  xpos = 0; // lumXInc/2 - 0x8000; // difference between pixel centers
131  fragmentPos = 0;
132 
 // Walk the output pixels; all work happens once per group of four (i % 4 == 0).
133  for (i = 0; i < dstW / numSplits; i++) {
 // Integer source position of output pixel i.
134  int xx = xpos >> 16;
135 
136  if ((i & 3) == 0) {
 // a..d: integer source offsets of the group's four pixels relative to xx.
137  int a = 0;
138  int b = ((xpos + xInc) >> 16) - xx;
139  int c = ((xpos + xInc * 2) >> 16) - xx;
140  int d = ((xpos + xInc * 3) >> 16) - xx;
 // inc == 1 selects fragment B (d + 1 still fits in a 2-bit selector),
 // inc == 0 selects fragment A.
141  int inc = (d + 1 < 4);
142  uint8_t *fragment = inc ? fragmentB : fragmentA;
143  x86_reg imm8OfPShufW1 = inc ? imm8OfPShufW1B : imm8OfPShufW1A;
144  x86_reg imm8OfPShufW2 = inc ? imm8OfPShufW2B : imm8OfPShufW2A;
145  x86_reg fragmentLength = inc ? fragmentLengthB : fragmentLengthA;
 // Largest amount that can be added to every selector while the biggest
 // one (d + inc) still stays <= 3.
146  int maxShift = 3 - (d + inc);
147  int shift = 0;
148 
 // With filterCode == NULL nothing is written; only fragmentPos advances.
149  if (filterCode) {
 // Coefficient: inverted 16-bit fraction reduced to 7 bits (0..127).
 // NOTE(review): entries i + 1 .. i + 3 are written even when fewer
 // than four pixels remain -- the filter buffer is presumably padded
 // by the caller; confirm.
150  filter[i] = ((xpos & 0xFFFF) ^ 0xFFFF) >> 9;
151  filter[i + 1] = (((xpos + xInc) & 0xFFFF) ^ 0xFFFF) >> 9;
152  filter[i + 2] = (((xpos + xInc * 2) & 0xFFFF) ^ 0xFFFF) >> 9;
153  filter[i + 3] = (((xpos + xInc * 3) & 0xFFFF) ^ 0xFFFF) >> 9;
 // One entry per group, at every second int32 slot; the generated code
 // advances its table offset by 8 bytes per group.
154  filterPos[i / 2] = xx;
155 
156  memcpy(filterCode + fragmentPos, fragment, fragmentLength);
157 
 // Patch the pshufw immediates: four 2-bit selectors each, the first
 // pshufw using the offsets + inc, the second the plain offsets.
158  filterCode[fragmentPos + imm8OfPShufW1] = (a + inc) |
159  ((b + inc) << 2) |
160  ((c + inc) << 4) |
161  ((d + inc) << 6);
162  filterCode[fragmentPos + imm8OfPShufW2] = a | (b << 2) |
163  (c << 4) |
164  (d << 6);
165 
166  if (i + 4 - inc >= dstW)
167  shift = maxShift; // avoid overread
168  else if ((filterPos[i / 2] & 3) <= maxShift)
169  shift = filterPos[i / 2] & 3; // align
170 
 // Move the load position back by `shift` pixels and compensate by
 // adding `shift` to all four selectors (0x55 == 01 01 01 01b).
171  if (shift && i >= shift) {
172  filterCode[fragmentPos + imm8OfPShufW1] += 0x55 * shift;
173  filterCode[fragmentPos + imm8OfPShufW2] += 0x55 * shift;
174  filterPos[i / 2] -= shift;
175  }
176  }
177 
178  fragmentPos += fragmentLength;
179 
 // Terminate the code emitted so far; the next fragment, if any, is
 // copied over this byte.
180  if (filterCode)
181  filterCode[fragmentPos] = RET;
182  }
183  xpos += xInc;
184  }
185  if (filterCode)
186  filterPos[((i / 2) + 1) & (~1)] = xpos >> 16; // needed to jump to the next part
187 
 // Fragment bytes plus one for the trailing RET.
188  return fragmentPos + 1;
189 }
190 
/**
 * Horizontally scale one line with the runtime-generated MMXEXT code, using
 * the context's hLumFilterPos / hLumFilter tables and lumMmxextFilterCode.
 *
 * @param c        scaler context providing the tables and the generated code
 * @param dst      output line of 16-bit samples
 * @param dstWidth number of output samples; only used by the edge fix-up loop
 *                 at the end (the amount produced by the generated code is
 *                 fixed when that code is built)
 * @param src      input line of 8-bit samples
 * @param srcW     number of input samples
 * @param xInc     horizontal step per output sample in 16.16 fixed point
 */
191 void ff_hyscale_fast_mmxext(SwsContext *c, int16_t *dst,
192  int dstWidth, const uint8_t *src,
193  int srcW, int xInc)
194 {
195  int32_t *filterPos = c->hLumFilterPos;
196  int16_t *filter = c->hLumFilter;
197  void *mmxextFilterCode = c->lumMmxextFilterCode;
198  int i;
 // Scratch slot used by the asm below: on x86-64 it holds the 8 bytes found
 // at -8(%rsp), on x86-32 without a free EBX it holds the caller's EBX.
199 #if ARCH_X86_64
200  uint64_t retsave;
201 #else
202 #if !HAVE_EBX_AVAILABLE
203  uint64_t ebxsave;
204 #endif
205 #endif
206 
207  __asm__ volatile(
208 #if ARCH_X86_64
 // Each `call` below pushes its return address at -8(%rsp); keep a copy of
 // what is stored there and put it back after the last call.
 // NOTE(review): %5 is written here although it is listed among the input
 // operands ("m"), not the outputs.
209  "mov -8(%%rsp), %%"FF_REG_a" \n\t"
210  "mov %%"FF_REG_a", %5 \n\t" // retsave
211 #else
212 #if !HAVE_EBX_AVAILABLE
 // EBX is not in the clobber list in this configuration, so it is saved
 // and restored by hand.
213  "mov %%"FF_REG_b", %5 \n\t" // ebxsave
214 #endif
215 #endif
 // Register set-up expected by the generated code:
 // mm7 = 0, c = src, D = dst, d = filter, b = filterPos, a = 0.
216  "pxor %%mm7, %%mm7 \n\t"
217  "mov %0, %%"FF_REG_c" \n\t"
218  "mov %1, %%"FF_REG_D" \n\t"
219  "mov %2, %%"FF_REG_d" \n\t"
220  "mov %3, %%"FF_REG_b" \n\t"
221  "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" // i
222  PREFETCH" (%%"FF_REG_c") \n\t"
223  PREFETCH" 32(%%"FF_REG_c") \n\t"
224  PREFETCH" 64(%%"FF_REG_c") \n\t"
225 
 // CALL_MMXEXT_FILTER_CODE runs the generated code once: it loads the first
 // source offset from filterPos into esi, calls the code through the memory
 // operand %4, then advances the source pointer by the filterPos entry at
 // byte offset `a`, advances the destination pointer by `a` (the number of
 // bytes the generated code stored) and resets `a` to 0. The x86-64 variant
 // goes through esi/S for the source advance instead of adding from memory.
226 #if ARCH_X86_64
227 #define CALL_MMXEXT_FILTER_CODE \
228  "movl (%%"FF_REG_b"), %%esi \n\t"\
229  "call *%4 \n\t"\
230  "movl (%%"FF_REG_b", %%"FF_REG_a"), %%esi \n\t"\
231  "add %%"FF_REG_S", %%"FF_REG_c" \n\t"\
232  "add %%"FF_REG_a", %%"FF_REG_D" \n\t"\
233  "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\
234 
235 #else
236 #define CALL_MMXEXT_FILTER_CODE \
237  "movl (%%"FF_REG_b"), %%esi \n\t"\
238  "call *%4 \n\t"\
239  "addl (%%"FF_REG_b", %%"FF_REG_a"), %%"FF_REG_c" \n\t"\
240  "add %%"FF_REG_a", %%"FF_REG_D" \n\t"\
241  "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\
242 
243 #endif /* ARCH_X86_64 */
244 
 // Eight passes through the generated code cover the line
 // (presumably matching the numSplits value the code was built with -- confirm).
245  CALL_MMXEXT_FILTER_CODE
246  CALL_MMXEXT_FILTER_CODE
247  CALL_MMXEXT_FILTER_CODE
248  CALL_MMXEXT_FILTER_CODE
249  CALL_MMXEXT_FILTER_CODE
250  CALL_MMXEXT_FILTER_CODE
251  CALL_MMXEXT_FILTER_CODE
252  CALL_MMXEXT_FILTER_CODE
253 
 // Undo the save done at the top of the asm statement.
254 #if ARCH_X86_64
255  "mov %5, %%"FF_REG_a" \n\t"
256  "mov %%"FF_REG_a", -8(%%rsp) \n\t"
257 #else
258 #if !HAVE_EBX_AVAILABLE
259  "mov %5, %%"FF_REG_b" \n\t"
260 #endif
261 #endif
 // Operands: %0 src, %1 dst, %2 filter, %3 filterPos, %4 generated code,
 // %5 save slot.
 // NOTE(review): the generated code stores to dst, but there is no "memory"
 // clobber and no output operand describing that.
262  :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
263  "m" (mmxextFilterCode)
264 #if ARCH_X86_64
265  ,"m"(retsave)
266 #else
267 #if !HAVE_EBX_AVAILABLE
268  ,"m" (ebxsave)
269 #endif
270 #endif
271  : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_D
272 #if ARCH_X86_64 || HAVE_EBX_AVAILABLE
273  ,"%"FF_REG_b
274 #endif
275  );
276 
 // Right-edge fix-up: every output sample whose source position
 // (i * xInc) >> 16 is at or beyond the last input sample is replaced by
 // that last sample scaled by 128 (the same scale as the << 7 in the
 // generated code).
277  for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
278  dst[i] = src[srcW-1]*128;
279 }
280 
/**
 * Horizontally scale one line of each of two planes with the runtime-generated
 * MMXEXT code, using the context's hChrFilterPos / hChrFilter tables and
 * chrMmxextFilterCode. Both planes share the same tables and code.
 *
 * @param c        scaler context providing the tables and the generated code
 * @param dst1     output line for the first plane (16-bit samples)
 * @param dst2     output line for the second plane (16-bit samples)
 * @param dstWidth number of output samples per plane; only used by the edge
 *                 fix-up loop at the end
 * @param src1     input line of the first plane (8-bit samples)
 * @param src2     input line of the second plane (8-bit samples)
 * @param srcW     number of input samples per plane
 * @param xInc     horizontal step per output sample in 16.16 fixed point
 */
281 void ff_hcscale_fast_mmxext(SwsContext *c, int16_t *dst1, int16_t *dst2,
282  int dstWidth, const uint8_t *src1,
283  const uint8_t *src2, int srcW, int xInc)
284 {
285  int32_t *filterPos = c->hChrFilterPos;
286  int16_t *filter = c->hChrFilter;
287  void *mmxextFilterCode = c->chrMmxextFilterCode;
288  int i;
 // 8-byte aligned scratch slot used by the asm below: on x86-64 it holds the
 // 8 bytes found at -8(%rsp), on x86-32 without a free EBX the caller's EBX.
289 #if ARCH_X86_64
290  DECLARE_ALIGNED(8, uint64_t, retsave);
291 #else
292 #if !HAVE_EBX_AVAILABLE
293  DECLARE_ALIGNED(8, uint64_t, ebxsave);
294 #endif
295 #endif
296  __asm__ volatile(
297 #if ARCH_X86_64
 // Each `call` below pushes its return address at -8(%rsp); keep a copy of
 // what is stored there and put it back after the last call.
 // NOTE(review): %7 is written here although it is listed among the input
 // operands ("m"), not the outputs.
298  "mov -8(%%rsp), %%"FF_REG_a" \n\t"
299  "mov %%"FF_REG_a", %7 \n\t" // retsave
300 #else
301 #if !HAVE_EBX_AVAILABLE
 // EBX is not in the clobber list in this configuration, so it is saved
 // and restored by hand.
302  "mov %%"FF_REG_b", %7 \n\t" // ebxsave
303 #endif
304 #endif
 // First plane: mm7 = 0, c = src1, D = dst1, d = filter, b = filterPos, a = 0.
305  "pxor %%mm7, %%mm7 \n\t"
306  "mov %0, %%"FF_REG_c" \n\t"
307  "mov %1, %%"FF_REG_D" \n\t"
308  "mov %2, %%"FF_REG_d" \n\t"
309  "mov %3, %%"FF_REG_b" \n\t"
310  "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" // i
311  PREFETCH" (%%"FF_REG_c") \n\t"
312  PREFETCH" 32(%%"FF_REG_c") \n\t"
313  PREFETCH" 64(%%"FF_REG_c") \n\t"
314 
 // Four passes through the generated code per plane, using the
 // CALL_MMXEXT_FILTER_CODE macro defined earlier in this file
 // (presumably matching the numSplits value the code was built with -- confirm).
315  CALL_MMXEXT_FILTER_CODE
316  CALL_MMXEXT_FILTER_CODE
317  CALL_MMXEXT_FILTER_CODE
318  CALL_MMXEXT_FILTER_CODE
 // Second plane: only the source and destination pointers change; the
 // filter (d) and filterPos (b) registers are left as they are.
319  "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" // i
320  "mov %5, %%"FF_REG_c" \n\t" // src2
321  "mov %6, %%"FF_REG_D" \n\t" // dst2
322  PREFETCH" (%%"FF_REG_c") \n\t"
323  PREFETCH" 32(%%"FF_REG_c") \n\t"
324  PREFETCH" 64(%%"FF_REG_c") \n\t"
325 
326  CALL_MMXEXT_FILTER_CODE
327  CALL_MMXEXT_FILTER_CODE
328  CALL_MMXEXT_FILTER_CODE
329  CALL_MMXEXT_FILTER_CODE
330 
 // Undo the save done at the top of the asm statement.
331 #if ARCH_X86_64
332  "mov %7, %%"FF_REG_a" \n\t"
333  "mov %%"FF_REG_a", -8(%%rsp) \n\t"
334 #else
335 #if !HAVE_EBX_AVAILABLE
336  "mov %7, %%"FF_REG_b" \n\t"
337 #endif
338 #endif
 // Operands: %0 src1, %1 dst1, %2 filter, %3 filterPos, %4 generated code,
 // %5 src2, %6 dst2, %7 save slot.
 // NOTE(review): the generated code stores to dst1/dst2, but there is no
 // "memory" clobber and no output operand describing that.
339  :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
340  "m" (mmxextFilterCode), "m" (src2), "m"(dst2)
341 #if ARCH_X86_64
342  ,"m"(retsave)
343 #else
344 #if !HAVE_EBX_AVAILABLE
345  ,"m" (ebxsave)
346 #endif
347 #endif
348  : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_D
349 #if ARCH_X86_64 || HAVE_EBX_AVAILABLE
350  ,"%"FF_REG_b
351 #endif
352  );
353 
 // Right-edge fix-up for both planes: every output sample whose source
 // position (i * xInc) >> 16 is at or beyond the last input sample is
 // replaced by that plane's last input sample scaled by 128.
354  for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
355  dst1[i] = src1[srcW-1]*128;
356  dst2[i] = src2[srcW-1]*128;
357  }
358 }
359 #endif //HAVE_INLINE_ASM
cpu.h
b
#define b
Definition: input.c:41
filter
filter_frame: For filters that do not use the activate() callback, this method is called when a frame is pushed to the filter's input. It can be called at any time except in a reentrant way. If the input frame is enough to produce output, then the filter should push the output frames on the output link immediately. As an exception to the previous rule, if the input frame is enough to produce several output frames, then the filter needs output only at least one per link. The additional frames can be left buffered in the filter.
Definition: filter_design.txt:228
fragment
Definition: dashdec.c:33
src
#define src
Definition: vp8dsp.c:254
ff_hcscale_fast_mmxext
void ff_hcscale_fast_mmxext(SwsContext *c, int16_t *dst1, int16_t *dst2, int dstWidth, const uint8_t *src1, const uint8_t *src2, int srcW, int xInc)
av_cold
#define av_cold
Definition: attributes.h:84
int32_t
int32_t
Definition: audio_convert.c:194
ff_init_hscaler_mmxext
int ff_init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode, int16_t *filter, int32_t *filterPos, int numSplits)
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
asm.h
a
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:41
DECLARE_ALIGNED
#define DECLARE_ALIGNED(n, t, v)
Definition: mem.h:112
src1
#define src1
Definition: h264pred.c:139
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:259
LOCAL_MANGLE
#define LOCAL_MANGLE(a)
Definition: asm.h:109
uint8_t
uint8_t
Definition: audio_convert.c:194
PREFETCH
#define PREFETCH
Definition: hscale_fast_bilinear_simd.c:26
RET
#define RET
Definition: hscale_fast_bilinear_simd.c:25
ff_hyscale_fast_mmxext
void ff_hyscale_fast_mmxext(SwsContext *c, int16_t *dst, int dstWidth, const uint8_t *src, int srcW, int xInc)
shift
static int shift(int a, int b)
Definition: sonic.c:82
x86_reg
int x86_reg
Definition: asm.h:72
SwsContext
Definition: swscale_internal.h:280