FFmpeg
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
hscale_fast_bilinear_simd.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "../swscale_internal.h"
22 #include "libavutil/x86/asm.h"
23 #include "libavutil/x86/cpu.h"
24 
25 #define RET 0xC3 // near return opcode for x86
26 #define PREFETCH "prefetchnta" // prefetch mnemonic pasted into the inline-asm strings of the scaler functions below
27 
28 #if HAVE_INLINE_ASM
/**
 * Build (or just measure) the runtime-generated MMXEXT horizontal scaler.
 *
 * Two template code fragments (A and B) are emitted by the inline asm below;
 * they are jumped over and never executed from this function. For every group
 * of four output pixels one template is copied into filterCode and the imm8
 * bytes of its two pshufw instructions are patched with per-pixel selectors.
 *
 * @param dstW       destination width; the loop covers dstW / numSplits pixels
 * @param xInc       source step per output pixel; xpos >> 16 is used as the
 *                   integer source position and xpos & 0xFFFF as the fraction
 * @param filterCode buffer that receives the generated code, or NULL. When
 *                   NULL, nothing is written to filterCode, filter or
 *                   filterPos and only the code size is computed.
 * @param filter     receives one weight in the range 0..127 per output pixel
 * @param filterPos  receives, at index i / 2, the source position of each
 *                   group of four output pixels
 * @param numSplits  divisor applied to dstW for the loop bound
 * @return number of bytes of generated code, including the trailing RET byte
 */
29 av_cold int ff_init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode,
30  int16_t *filter, int32_t *filterPos,
31  int numSplits)
32 {
33  uint8_t *fragmentA;
34  x86_reg imm8OfPShufW1A;
35  x86_reg imm8OfPShufW2A;
36  x86_reg fragmentLengthA;
37  uint8_t *fragmentB;
38  x86_reg imm8OfPShufW1B;
39  x86_reg imm8OfPShufW2B;
40  x86_reg fragmentLengthB;
41  int fragmentPos;
42 
43  int xpos, i;
44 
45  // create an optimized horizontal scaling routine
46  /* This scaler is made of runtime-generated MMXEXT code using specially tuned
47  * pshufw instructions. For every four output pixels, if four input pixels
48  * are enough for the fast bilinear scaling, then a chunk of fragmentB is
49  * used. If five input pixels are needed, then a chunk of fragmentA is used.
50  */
51 
52  // code fragment
53 
    /* Template A: loads source bytes at offsets 0 and 1 (two movd loads).
     * Register roles are those set up by ff_hyscale_fast_mmxext() and
     * ff_hcscale_fast_mmxext() in this file: REG_c = source, REG_D = destination,
     * REG_d = filter, REG_b = filterPos, REG_a = running byte offset,
     * REG_S = current source position, mm7 = zero.
     * The fragment stores (mm0 - mm1) * mm3 + (mm1 << 7) as 8 bytes to the
     * destination, preloads the next position from 8(REG_b, REG_a) into esi
     * and advances REG_a by 8. */
54  __asm__ volatile (
55  "jmp 9f \n\t"
56  // Begin
57  "0: \n\t"
58  "movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t"
59  "movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t"
60  "movd 1(%%"REG_c", %%"REG_S"), %%mm1 \n\t"
61  "punpcklbw %%mm7, %%mm1 \n\t"
62  "punpcklbw %%mm7, %%mm0 \n\t"
63  "pshufw $0xFF, %%mm1, %%mm1 \n\t"
64  "1: \n\t"
65  "pshufw $0xFF, %%mm0, %%mm0 \n\t"
66  "2: \n\t"
67  "psubw %%mm1, %%mm0 \n\t"
68  "movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t"
69  "pmullw %%mm3, %%mm0 \n\t"
70  "psllw $7, %%mm1 \n\t"
71  "paddw %%mm1, %%mm0 \n\t"
72 
73  "movq %%mm0, (%%"REG_D", %%"REG_a") \n\t"
74 
75  "add $8, %%"REG_a" \n\t"
76  // End
77  "9: \n\t"
78  // "int $3 \n\t"
    /* Executed part: %0 = address of label 0 (fragment start); %1 and %2 =
     * offsets, relative to label 0, of the byte just before labels 1 and 2
     * (per the variable names, the imm8 of each pshufw); %3 = fragment
     * length (label 9 - label 0). */
79  "lea " LOCAL_MANGLE(0b) ", %0 \n\t"
80  "lea " LOCAL_MANGLE(1b) ", %1 \n\t"
81  "lea " LOCAL_MANGLE(2b) ", %2 \n\t"
82  "dec %1 \n\t"
83  "dec %2 \n\t"
84  "sub %0, %1 \n\t"
85  "sub %0, %2 \n\t"
86  "lea " LOCAL_MANGLE(9b) ", %3 \n\t"
87  "sub %0, %3 \n\t"
88 
89 
90  : "=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
91  "=r" (fragmentLengthA)
92  );
93 
    /* Template B: a single movd load; both pshufw instructions read from mm0.
     * The address/offset/length computation after label 9 is the same as for
     * template A. */
94  __asm__ volatile (
95  "jmp 9f \n\t"
96  // Begin
97  "0: \n\t"
98  "movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t"
99  "movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t"
100  "punpcklbw %%mm7, %%mm0 \n\t"
101  "pshufw $0xFF, %%mm0, %%mm1 \n\t"
102  "1: \n\t"
103  "pshufw $0xFF, %%mm0, %%mm0 \n\t"
104  "2: \n\t"
105  "psubw %%mm1, %%mm0 \n\t"
106  "movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t"
107  "pmullw %%mm3, %%mm0 \n\t"
108  "psllw $7, %%mm1 \n\t"
109  "paddw %%mm1, %%mm0 \n\t"
110 
111  "movq %%mm0, (%%"REG_D", %%"REG_a") \n\t"
112 
113  "add $8, %%"REG_a" \n\t"
114  // End
115  "9: \n\t"
116  // "int $3 \n\t"
117  "lea " LOCAL_MANGLE(0b) ", %0 \n\t"
118  "lea " LOCAL_MANGLE(1b) ", %1 \n\t"
119  "lea " LOCAL_MANGLE(2b) ", %2 \n\t"
120  "dec %1 \n\t"
121  "dec %2 \n\t"
122  "sub %0, %1 \n\t"
123  "sub %0, %2 \n\t"
124  "lea " LOCAL_MANGLE(9b) ", %3 \n\t"
125  "sub %0, %3 \n\t"
126 
127 
128  : "=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
129  "=r" (fragmentLengthB)
130  );
131 
132  xpos = 0; // lumXInc/2 - 0x8000; // difference between pixel centers
133  fragmentPos = 0;
134 
135  for (i = 0; i < dstW / numSplits; i++) {
136  int xx = xpos >> 16;
137 
    // One fragment is emitted per group of four output pixels.
138  if ((i & 3) == 0) {
    // a..d: source offsets of the four output pixels relative to xx.
139  int a = 0;
140  int b = ((xpos + xInc) >> 16) - xx;
141  int c = ((xpos + xInc * 2) >> 16) - xx;
142  int d = ((xpos + xInc * 3) >> 16) - xx;
    // inc is 1 when d <= 2; it selects template B and is added to every
    // selector of the first pshufw below.
143  int inc = (d + 1 < 4);
144  uint8_t *fragment = inc ? fragmentB : fragmentA;
145  x86_reg imm8OfPShufW1 = inc ? imm8OfPShufW1B : imm8OfPShufW1A;
146  x86_reg imm8OfPShufW2 = inc ? imm8OfPShufW2B : imm8OfPShufW2A;
147  x86_reg fragmentLength = inc ? fragmentLengthB : fragmentLengthA;
    // Largest shift that keeps the biggest selector (d + inc) within 2 bits.
148  int maxShift = 3 - (d + inc);
149  int shift = 0;
150 
151  if (filterCode) {
    // Weights: inverted 16-bit fraction reduced to 7 bits (0..127).
152  filter[i] = ((xpos & 0xFFFF) ^ 0xFFFF) >> 9;
153  filter[i + 1] = (((xpos + xInc) & 0xFFFF) ^ 0xFFFF) >> 9;
154  filter[i + 2] = (((xpos + xInc * 2) & 0xFFFF) ^ 0xFFFF) >> 9;
155  filter[i + 3] = (((xpos + xInc * 3) & 0xFFFF) ^ 0xFFFF) >> 9;
    // Index i / 2: each fragment advances REG_a by 8 bytes, i.e. two
    // int32_t filterPos entries per group of four pixels.
156  filterPos[i / 2] = xx;
157 
158  memcpy(filterCode + fragmentPos, fragment, fragmentLength);
159 
    // Patch the copied pshufw immediates: four 2-bit selectors packed at
    // bits 0, 2, 4 and 6.
160  filterCode[fragmentPos + imm8OfPShufW1] = (a + inc) |
161  ((b + inc) << 2) |
162  ((c + inc) << 4) |
163  ((d + inc) << 6);
164  filterCode[fragmentPos + imm8OfPShufW2] = a | (b << 2) |
165  (c << 4) |
166  (d << 6);
167 
168  if (i + 4 - inc >= dstW)
169  shift = maxShift; // avoid overread
170  else if ((filterPos[i / 2] & 3) <= maxShift)
171  shift = filterPos[i / 2] & 3; // align
172 
    // 0x55 * shift adds `shift` to each of the four 2-bit selectors; the
    // source position is moved back by the same amount to compensate.
173  if (shift && i >= shift) {
174  filterCode[fragmentPos + imm8OfPShufW1] += 0x55 * shift;
175  filterCode[fragmentPos + imm8OfPShufW2] += 0x55 * shift;
176  filterPos[i / 2] -= shift;
177  }
178  }
179 
180  fragmentPos += fragmentLength;
181 
    // Provisional terminator: the next group's memcpy starts at this same
    // offset and overwrites it, so only the final RET remains.
182  if (filterCode)
183  filterCode[fragmentPos] = RET;
184  }
185  xpos += xInc;
186  }
187  if (filterCode)
188  filterPos[((i / 2) + 1) & (~1)] = xpos >> 16; // needed to jump to the next part
189 
    // Sum of all fragment lengths plus one byte for the final RET.
190  return fragmentPos + 1;
191 }
192 
/**
 * Horizontally scale one line using the runtime-generated MMXEXT code stored
 * in c->lumMmxextFilterCode (the hLum* context fields are used throughout).
 *
 * The inline asm loads the pointers into fixed registers and invokes the
 * generated code eight times via CALL_MMXEXT_FILTER_CODE; a C loop afterwards
 * overwrites the trailing outputs that map to or past the last source pixel.
 *
 * @param c        context supplying hLumFilterPos, hLumFilter and
 *                 lumMmxextFilterCode
 * @param dst      output samples (int16_t)
 * @param dstWidth number of output samples; used here only by the C tail loop
 * @param src      input bytes
 * @param srcW     source width; used here only by the C tail loop
 * @param xInc     source step per output pixel; the tail loop uses
 *                 (i * xInc) >> 16 as the source position
 */
193 void ff_hyscale_fast_mmxext(SwsContext *c, int16_t *dst,
194  int dstWidth, const uint8_t *src,
195  int srcW, int xInc)
196 {
197  int32_t *filterPos = c->hLumFilterPos;
198  int16_t *filter = c->hLumFilter;
199  void *mmxextFilterCode = c->lumMmxextFilterCode;
200  int i;
201 #if defined(PIC)
202  uint64_t ebxsave;
203 #endif
204 #if ARCH_X86_64
205  uint64_t retsave;
206 #endif
207 
208  __asm__ volatile(
    /* Prologue: under PIC, REG_b is saved into ebxsave; on x86-64 the value at
     * -8(%rsp) is saved into retsave. The operand index of each save slot
     * depends on which of the optional "m" operands are present.
     * NOTE(review): -8(%rsp) is presumably preserved because each `call`
     * below pushes its return address into that slot — confirm. */
209 #if defined(PIC)
210  "mov %%"REG_b", %5 \n\t"
211 #if ARCH_X86_64
212  "mov -8(%%rsp), %%"REG_a" \n\t"
213  "mov %%"REG_a", %6 \n\t"
214 #endif
215 #else
216 #if ARCH_X86_64
217  "mov -8(%%rsp), %%"REG_a" \n\t"
218  "mov %%"REG_a", %5 \n\t"
219 #endif
220 #endif
    /* Register setup: mm7 = 0, REG_c = src, REG_D = dst, REG_d = filter,
     * REG_b = filterPos, REG_a = 0. */
221  "pxor %%mm7, %%mm7 \n\t"
222  "mov %0, %%"REG_c" \n\t"
223  "mov %1, %%"REG_D" \n\t"
224  "mov %2, %%"REG_d" \n\t"
225  "mov %3, %%"REG_b" \n\t"
226  "xor %%"REG_a", %%"REG_a" \n\t" // i
227  PREFETCH" (%%"REG_c") \n\t"
228  PREFETCH" 32(%%"REG_c") \n\t"
229  PREFETCH" 64(%%"REG_c") \n\t"
230 
    /* CALL_MMXEXT_FILTER_CODE: load esi from (REG_b), call the generated code
     * through operand %4, then advance REG_c by the value at (REG_b, REG_a),
     * advance REG_D by REG_a and reset REG_a to 0. The x86-64 variant loads
     * that value into esi first and adds REG_S; the other variant adds it
     * to REG_c directly from memory. The macro stays defined after this
     * function and is reused by ff_hcscale_fast_mmxext(). */
231 #if ARCH_X86_64
232 #define CALL_MMXEXT_FILTER_CODE \
233  "movl (%%"REG_b"), %%esi \n\t"\
234  "call *%4 \n\t"\
235  "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
236  "add %%"REG_S", %%"REG_c" \n\t"\
237  "add %%"REG_a", %%"REG_D" \n\t"\
238  "xor %%"REG_a", %%"REG_a" \n\t"\
239 
240 #else
241 #define CALL_MMXEXT_FILTER_CODE \
242  "movl (%%"REG_b"), %%esi \n\t"\
243  "call *%4 \n\t"\
244  "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
245  "add %%"REG_a", %%"REG_D" \n\t"\
246  "xor %%"REG_a", %%"REG_a" \n\t"\
247 
248 #endif /* ARCH_X86_64 */
249 
    /* Eight invocations of the generated code.
     * NOTE(review): presumably matches the numSplits value used when the code
     * was generated by ff_init_hscaler_mmxext() — confirm against the caller. */
250  CALL_MMXEXT_FILTER_CODE
251  CALL_MMXEXT_FILTER_CODE
252  CALL_MMXEXT_FILTER_CODE
253  CALL_MMXEXT_FILTER_CODE
254  CALL_MMXEXT_FILTER_CODE
255  CALL_MMXEXT_FILTER_CODE
256  CALL_MMXEXT_FILTER_CODE
257  CALL_MMXEXT_FILTER_CODE
258 
    /* Epilogue: restore what the prologue saved, using the same operands. */
259 #if defined(PIC)
260  "mov %5, %%"REG_b" \n\t"
261 #if ARCH_X86_64
262  "mov %6, %%"REG_a" \n\t"
263  "mov %%"REG_a", -8(%%rsp) \n\t"
264 #endif
265 #else
266 #if ARCH_X86_64
267  "mov %5, %%"REG_a" \n\t"
268  "mov %%"REG_a", -8(%%rsp) \n\t"
269 #endif
270 #endif
271  :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
272  "m" (mmxextFilterCode)
273 #if defined(PIC)
274  ,"m" (ebxsave)
275 #endif
276 #if ARCH_X86_64
277  ,"m"(retsave)
278 #endif
279  : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
    /* REG_b is declared clobbered only when not PIC; under PIC it is saved
     * and restored by hand above instead. */
280 #if !defined(PIC)
281  ,"%"REG_b
282 #endif
283  );
284 
    // Tail fix-up: walking back from the last output, every sample whose source
    // position is at or beyond srcW - 1 becomes the last source pixel times 128
    // (the same << 7 scale the generated code applies via psllw $7).
285  for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
286  dst[i] = src[srcW-1]*128;
287 }
288 
/**
 * Horizontally scale two lines (src1 -> dst1, src2 -> dst2) using the
 * runtime-generated MMXEXT code stored in c->chrMmxextFilterCode (the hChr*
 * context fields are used throughout).
 *
 * The inline asm runs the generated code four times for the first plane,
 * reloads the source/destination registers for the second plane and runs it
 * four more times. CALL_MMXEXT_FILTER_CODE is the macro #defined inside the
 * asm statement of ff_hyscale_fast_mmxext() earlier in this file. A C loop
 * afterwards overwrites the trailing outputs of both planes.
 *
 * @param c        context supplying hChrFilterPos, hChrFilter and
 *                 chrMmxextFilterCode
 * @param dst1     output samples for src1 (int16_t)
 * @param dst2     output samples for src2 (int16_t)
 * @param dstWidth number of output samples; used here only by the C tail loop
 * @param src1     first input line
 * @param src2     second input line
 * @param srcW     source width; used here only by the C tail loop
 * @param xInc     source step per output pixel; the tail loop uses
 *                 (i * xInc) >> 16 as the source position
 */
289 void ff_hcscale_fast_mmxext(SwsContext *c, int16_t *dst1, int16_t *dst2,
290  int dstWidth, const uint8_t *src1,
291  const uint8_t *src2, int srcW, int xInc)
292 {
293  int32_t *filterPos = c->hChrFilterPos;
294  int16_t *filter = c->hChrFilter;
295  void *mmxextFilterCode = c->chrMmxextFilterCode;
296  int i;
297 #if defined(PIC)
298  DECLARE_ALIGNED(8, uint64_t, ebxsave);
299 #endif
300 #if ARCH_X86_64
301  DECLARE_ALIGNED(8, uint64_t, retsave);
302 #endif
303 
304  __asm__ volatile(
    /* Prologue: under PIC, REG_b is saved into ebxsave; on x86-64 the value at
     * -8(%rsp) is saved into retsave. The save slots are operands %7/%8 here
     * because src2 and dst2 occupy %5 and %6.
     * NOTE(review): -8(%rsp) is presumably preserved because each `call`
     * below pushes its return address into that slot — confirm. */
305 #if defined(PIC)
306  "mov %%"REG_b", %7 \n\t"
307 #if ARCH_X86_64
308  "mov -8(%%rsp), %%"REG_a" \n\t"
309  "mov %%"REG_a", %8 \n\t"
310 #endif
311 #else
312 #if ARCH_X86_64
313  "mov -8(%%rsp), %%"REG_a" \n\t"
314  "mov %%"REG_a", %7 \n\t"
315 #endif
316 #endif
    /* First plane: mm7 = 0, REG_c = src1, REG_D = dst1, REG_d = filter,
     * REG_b = filterPos, REG_a = 0. */
317  "pxor %%mm7, %%mm7 \n\t"
318  "mov %0, %%"REG_c" \n\t"
319  "mov %1, %%"REG_D" \n\t"
320  "mov %2, %%"REG_d" \n\t"
321  "mov %3, %%"REG_b" \n\t"
322  "xor %%"REG_a", %%"REG_a" \n\t" // i
323  PREFETCH" (%%"REG_c") \n\t"
324  PREFETCH" 32(%%"REG_c") \n\t"
325  PREFETCH" 64(%%"REG_c") \n\t"
326 
327  CALL_MMXEXT_FILTER_CODE
328  CALL_MMXEXT_FILTER_CODE
329  CALL_MMXEXT_FILTER_CODE
330  CALL_MMXEXT_FILTER_CODE
    /* Second plane: only the source (src2, %5) and destination (dst2, %6)
     * registers are reloaded; REG_d and REG_b are not reloaded here. */
331  "xor %%"REG_a", %%"REG_a" \n\t" // i
332  "mov %5, %%"REG_c" \n\t" // src
333  "mov %6, %%"REG_D" \n\t" // buf2
334  PREFETCH" (%%"REG_c") \n\t"
335  PREFETCH" 32(%%"REG_c") \n\t"
336  PREFETCH" 64(%%"REG_c") \n\t"
337 
338  CALL_MMXEXT_FILTER_CODE
339  CALL_MMXEXT_FILTER_CODE
340  CALL_MMXEXT_FILTER_CODE
341  CALL_MMXEXT_FILTER_CODE
342 
    /* Epilogue: restore what the prologue saved, using the same operands. */
343 #if defined(PIC)
344  "mov %7, %%"REG_b" \n\t"
345 #if ARCH_X86_64
346  "mov %8, %%"REG_a" \n\t"
347  "mov %%"REG_a", -8(%%rsp) \n\t"
348 #endif
349 #else
350 #if ARCH_X86_64
351  "mov %7, %%"REG_a" \n\t"
352  "mov %%"REG_a", -8(%%rsp) \n\t"
353 #endif
354 #endif
355  :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
356  "m" (mmxextFilterCode), "m" (src2), "m"(dst2)
357 #if defined(PIC)
358  ,"m" (ebxsave)
359 #endif
360 #if ARCH_X86_64
361  ,"m"(retsave)
362 #endif
363  : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
    /* REG_b is declared clobbered only when not PIC; under PIC it is saved
     * and restored by hand above instead. */
364 #if !defined(PIC)
365  ,"%"REG_b
366 #endif
367  );
368 
    // Tail fix-up for both planes: walking back from the last output, every
    // sample whose source position is at or beyond srcW - 1 becomes the last
    // source pixel of its plane times 128.
369  for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
370  dst1[i] = src1[srcW-1]*128;
371  dst2[i] = src2[srcW-1]*128;
372  }
373 }
374 #endif //HAVE_INLINE_ASM