FFmpeg
snowdsp.c
1 /*
2  * MMX and SSE2 optimized snow DSP utils
3  * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include <stdint.h>
23 #include "config.h"
24 #include "libavutil/attributes.h"
25 #include "libavutil/cpu.h"
26 #include "libavutil/x86/asm.h"
27 #include "libavcodec/snow_dwt.h"
28 
29 #if HAVE_INLINE_ASM
30 
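/*
 * Overview (editorial summary of this file, not upstream text): three groups
 * of routines follow — the horizontal and vertical inverse 9/7 lifting steps
 * of the snow IDWT (MMX and SSE2 variants), and the OBMC block-adding
 * helpers (inner_add_yblock). ff_dwt_init_x86() at the end of the file picks
 * between them based on the detected CPU flags.
 */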
31 static void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, IDWTELEM *temp, int width){
32  const int w2= (width+1)>>1;
33  const int w_l= (width>>1);
34  const int w_r= w2 - 1;
35  int i;
36 
37  { // Lift 0
38  IDWTELEM * const ref = b + w2 - 1;
39  IDWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice
40  // (the first time erroneously), we allow the SSE2 code to run an extra pass.
41  // The savings in code and time are well worth having to store this value and
42  // calculate b[0] correctly afterwards.
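 /* A scalar sketch of what each pass of the SSE2 loop below computes, 16
  * coefficients at a time (the same Lift-0 formula used for b4[] in the
  * vertical compose later in this file; the lead-out helper after the loop
  * finishes the remainder and the odd-width edge):
  *
  *     for (; i < w_l; i++)
  *         b[i] -= (W_DM * (ref[i] + ref[i + 1]) + W_DO) >> W_DS;
  */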
43 
44  i = 0;
45  __asm__ volatile(
46  "pcmpeqd %%xmm7, %%xmm7 \n\t"
47  "pcmpeqd %%xmm3, %%xmm3 \n\t"
48  "psllw $1, %%xmm3 \n\t"
49  "paddw %%xmm7, %%xmm3 \n\t"
50  "psllw $13, %%xmm3 \n\t"
51  ::);
52  for(; i<w_l-15; i+=16){
53  __asm__ volatile(
54  "movdqu (%1), %%xmm1 \n\t"
55  "movdqu 16(%1), %%xmm5 \n\t"
56  "movdqu 2(%1), %%xmm2 \n\t"
57  "movdqu 18(%1), %%xmm6 \n\t"
58  "paddw %%xmm1, %%xmm2 \n\t"
59  "paddw %%xmm5, %%xmm6 \n\t"
60  "paddw %%xmm7, %%xmm2 \n\t"
61  "paddw %%xmm7, %%xmm6 \n\t"
62  "pmulhw %%xmm3, %%xmm2 \n\t"
63  "pmulhw %%xmm3, %%xmm6 \n\t"
64  "paddw (%0), %%xmm2 \n\t"
65  "paddw 16(%0), %%xmm6 \n\t"
66  "movdqa %%xmm2, (%0) \n\t"
67  "movdqa %%xmm6, 16(%0) \n\t"
68  :: "r"(&b[i]), "r"(&ref[i])
69  : "memory"
70  );
71  }
 72  snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
 73  b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
74  }
75 
76  { // Lift 1
77  IDWTELEM * const dst = b+w2;
78 
79  i = 0;
80  for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){
81  dst[i] = dst[i] - (b[i] + b[i + 1]);
82  }
83  for(; i<w_r-15; i+=16){
84  __asm__ volatile(
85  "movdqu (%1), %%xmm1 \n\t"
86  "movdqu 16(%1), %%xmm5 \n\t"
87  "movdqu 2(%1), %%xmm2 \n\t"
88  "movdqu 18(%1), %%xmm6 \n\t"
89  "paddw %%xmm1, %%xmm2 \n\t"
90  "paddw %%xmm5, %%xmm6 \n\t"
91  "movdqa (%0), %%xmm0 \n\t"
92  "movdqa 16(%0), %%xmm4 \n\t"
93  "psubw %%xmm2, %%xmm0 \n\t"
94  "psubw %%xmm6, %%xmm4 \n\t"
95  "movdqa %%xmm0, (%0) \n\t"
96  "movdqa %%xmm4, 16(%0) \n\t"
97  :: "r"(&dst[i]), "r"(&b[i])
98  : "memory"
99  );
100  }
 101  snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
 102  }
103 
104  { // Lift 2
105  IDWTELEM * const ref = b+w2 - 1;
106  IDWTELEM b_0 = b[0];
107 
108  i = 0;
109  __asm__ volatile(
110  "psllw $15, %%xmm7 \n\t"
111  "pcmpeqw %%xmm6, %%xmm6 \n\t"
112  "psrlw $13, %%xmm6 \n\t"
113  "paddw %%xmm7, %%xmm6 \n\t"
114  ::);
115  for(; i<w_l-15; i+=16){
116  __asm__ volatile(
117  "movdqu (%1), %%xmm0 \n\t"
118  "movdqu 16(%1), %%xmm4 \n\t"
119  "movdqu 2(%1), %%xmm1 \n\t"
120  "movdqu 18(%1), %%xmm5 \n\t" //FIXME try aligned reads and shifts
121  "paddw %%xmm6, %%xmm0 \n\t"
122  "paddw %%xmm6, %%xmm4 \n\t"
123  "paddw %%xmm7, %%xmm1 \n\t"
124  "paddw %%xmm7, %%xmm5 \n\t"
125  "pavgw %%xmm1, %%xmm0 \n\t"
126  "pavgw %%xmm5, %%xmm4 \n\t"
127  "psubw %%xmm7, %%xmm0 \n\t"
128  "psubw %%xmm7, %%xmm4 \n\t"
129  "psraw $1, %%xmm0 \n\t"
130  "psraw $1, %%xmm4 \n\t"
131  "movdqa (%0), %%xmm1 \n\t"
132  "movdqa 16(%0), %%xmm5 \n\t"
133  "paddw %%xmm1, %%xmm0 \n\t"
134  "paddw %%xmm5, %%xmm4 \n\t"
135  "psraw $2, %%xmm0 \n\t"
136  "psraw $2, %%xmm4 \n\t"
137  "paddw %%xmm1, %%xmm0 \n\t"
138  "paddw %%xmm5, %%xmm4 \n\t"
139  "movdqa %%xmm0, (%0) \n\t"
140  "movdqa %%xmm4, 16(%0) \n\t"
141  :: "r"(&b[i]), "r"(&ref[i])
142  : "memory"
143  );
144  }
 145  snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
 146  b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS);
147  }
148 
149  { // Lift 3
150  IDWTELEM * const src = b+w2;
151 
152  i = 0;
153  for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){
154  temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
155  }
156  for(; i<w_r-7; i+=8){
157  __asm__ volatile(
158  "movdqu 2(%1), %%xmm2 \n\t"
159  "movdqu 18(%1), %%xmm6 \n\t"
160  "paddw (%1), %%xmm2 \n\t"
161  "paddw 16(%1), %%xmm6 \n\t"
162  "movdqu (%0), %%xmm0 \n\t"
163  "movdqu 16(%0), %%xmm4 \n\t"
164  "paddw %%xmm2, %%xmm0 \n\t"
165  "paddw %%xmm6, %%xmm4 \n\t"
166  "psraw $1, %%xmm2 \n\t"
167  "psraw $1, %%xmm6 \n\t"
168  "paddw %%xmm0, %%xmm2 \n\t"
169  "paddw %%xmm4, %%xmm6 \n\t"
170  "movdqa %%xmm2, (%2) \n\t"
171  "movdqa %%xmm6, 16(%2) \n\t"
172  :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
173  : "memory"
174  );
175  }
 176  snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
 177  }
178 
179  {
 180  snow_interleave_line_header(&i, width, b, temp);
 181
182  for (; (i & 0x3E) != 0x3E; i-=2){
183  b[i+1] = temp[i>>1];
184  b[i] = b[i>>1];
185  }
186  for (i-=62; i>=0; i-=64){
187  __asm__ volatile(
188  "movdqa (%1), %%xmm0 \n\t"
189  "movdqa 16(%1), %%xmm2 \n\t"
190  "movdqa 32(%1), %%xmm4 \n\t"
191  "movdqa 48(%1), %%xmm6 \n\t"
192  "movdqa (%1), %%xmm1 \n\t"
193  "movdqa 16(%1), %%xmm3 \n\t"
194  "movdqa 32(%1), %%xmm5 \n\t"
195  "movdqa 48(%1), %%xmm7 \n\t"
196  "punpcklwd (%2), %%xmm0 \n\t"
197  "punpcklwd 16(%2), %%xmm2 \n\t"
198  "punpcklwd 32(%2), %%xmm4 \n\t"
199  "punpcklwd 48(%2), %%xmm6 \n\t"
200  "movdqa %%xmm0, (%0) \n\t"
201  "movdqa %%xmm2, 32(%0) \n\t"
202  "movdqa %%xmm4, 64(%0) \n\t"
203  "movdqa %%xmm6, 96(%0) \n\t"
204  "punpckhwd (%2), %%xmm1 \n\t"
205  "punpckhwd 16(%2), %%xmm3 \n\t"
206  "punpckhwd 32(%2), %%xmm5 \n\t"
207  "punpckhwd 48(%2), %%xmm7 \n\t"
208  "movdqa %%xmm1, 16(%0) \n\t"
209  "movdqa %%xmm3, 48(%0) \n\t"
210  "movdqa %%xmm5, 80(%0) \n\t"
211  "movdqa %%xmm7, 112(%0) \n\t"
212  :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1])
213  : "memory"
214  );
215  }
216  }
217 }
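/*
 * A scalar sketch of the interleave done by the last block of the function
 * above: after the four lifting steps, the low band sits in b[0..w2-1] and
 * the high band in temp[], and the line is rebuilt in place roughly as
 *
 *     for (int k = w2 - 1; k >= 0; k--) {
 *         if (2 * k + 1 < width)
 *             b[2 * k + 1] = temp[k];   // high-pass coefficient
 *         b[2 * k] = b[k];              // low-pass coefficient
 *     }
 *
 * The asm does the same thing 32 coefficient pairs at a time with
 * punpcklwd/punpckhwd; the MMX version below does the same with 64-bit
 * registers.
 */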
218 
219 static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int width){
220  const int w2= (width+1)>>1;
221  const int w_l= (width>>1);
222  const int w_r= w2 - 1;
223  int i;
224 
225  { // Lift 0
226  IDWTELEM * const ref = b + w2 - 1;
227 
228  i = 1;
229  b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
230  __asm__ volatile(
231  "pcmpeqw %%mm7, %%mm7 \n\t"
232  "pcmpeqw %%mm3, %%mm3 \n\t"
233  "psllw $1, %%mm3 \n\t"
234  "paddw %%mm7, %%mm3 \n\t"
235  "psllw $13, %%mm3 \n\t"
236  ::);
237  for(; i<w_l-7; i+=8){
238  __asm__ volatile(
239  "movq (%1), %%mm2 \n\t"
240  "movq 8(%1), %%mm6 \n\t"
241  "paddw 2(%1), %%mm2 \n\t"
242  "paddw 10(%1), %%mm6 \n\t"
243  "paddw %%mm7, %%mm2 \n\t"
244  "paddw %%mm7, %%mm6 \n\t"
245  "pmulhw %%mm3, %%mm2 \n\t"
246  "pmulhw %%mm3, %%mm6 \n\t"
247  "paddw (%0), %%mm2 \n\t"
248  "paddw 8(%0), %%mm6 \n\t"
249  "movq %%mm2, (%0) \n\t"
250  "movq %%mm6, 8(%0) \n\t"
251  :: "r"(&b[i]), "r"(&ref[i])
252  : "memory"
253  );
254  }
 255  snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
 256  }
257 
258  { // Lift 1
259  IDWTELEM * const dst = b+w2;
260 
261  i = 0;
262  for(; i<w_r-7; i+=8){
263  __asm__ volatile(
264  "movq (%1), %%mm2 \n\t"
265  "movq 8(%1), %%mm6 \n\t"
266  "paddw 2(%1), %%mm2 \n\t"
267  "paddw 10(%1), %%mm6 \n\t"
268  "movq (%0), %%mm0 \n\t"
269  "movq 8(%0), %%mm4 \n\t"
270  "psubw %%mm2, %%mm0 \n\t"
271  "psubw %%mm6, %%mm4 \n\t"
272  "movq %%mm0, (%0) \n\t"
273  "movq %%mm4, 8(%0) \n\t"
274  :: "r"(&dst[i]), "r"(&b[i])
275  : "memory"
276  );
277  }
 278  snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
 279  }
280 
281  { // Lift 2
282  IDWTELEM * const ref = b+w2 - 1;
283 
284  i = 1;
285  b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
286  __asm__ volatile(
287  "psllw $15, %%mm7 \n\t"
288  "pcmpeqw %%mm6, %%mm6 \n\t"
289  "psrlw $13, %%mm6 \n\t"
290  "paddw %%mm7, %%mm6 \n\t"
291  ::);
292  for(; i<w_l-7; i+=8){
293  __asm__ volatile(
294  "movq (%1), %%mm0 \n\t"
295  "movq 8(%1), %%mm4 \n\t"
296  "movq 2(%1), %%mm1 \n\t"
297  "movq 10(%1), %%mm5 \n\t"
298  "paddw %%mm6, %%mm0 \n\t"
299  "paddw %%mm6, %%mm4 \n\t"
300  "paddw %%mm7, %%mm1 \n\t"
301  "paddw %%mm7, %%mm5 \n\t"
302  "pavgw %%mm1, %%mm0 \n\t"
303  "pavgw %%mm5, %%mm4 \n\t"
304  "psubw %%mm7, %%mm0 \n\t"
305  "psubw %%mm7, %%mm4 \n\t"
306  "psraw $1, %%mm0 \n\t"
307  "psraw $1, %%mm4 \n\t"
308  "movq (%0), %%mm1 \n\t"
309  "movq 8(%0), %%mm5 \n\t"
310  "paddw %%mm1, %%mm0 \n\t"
311  "paddw %%mm5, %%mm4 \n\t"
312  "psraw $2, %%mm0 \n\t"
313  "psraw $2, %%mm4 \n\t"
314  "paddw %%mm1, %%mm0 \n\t"
315  "paddw %%mm5, %%mm4 \n\t"
316  "movq %%mm0, (%0) \n\t"
317  "movq %%mm4, 8(%0) \n\t"
318  :: "r"(&b[i]), "r"(&ref[i])
319  : "memory"
320  );
321  }
 322  snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
 323  }
324 
325  { // Lift 3
326  IDWTELEM * const src = b+w2;
327  i = 0;
328 
329  for(; i<w_r-7; i+=8){
330  __asm__ volatile(
331  "movq 2(%1), %%mm2 \n\t"
332  "movq 10(%1), %%mm6 \n\t"
333  "paddw (%1), %%mm2 \n\t"
334  "paddw 8(%1), %%mm6 \n\t"
335  "movq (%0), %%mm0 \n\t"
336  "movq 8(%0), %%mm4 \n\t"
337  "paddw %%mm2, %%mm0 \n\t"
338  "paddw %%mm6, %%mm4 \n\t"
339  "psraw $1, %%mm2 \n\t"
340  "psraw $1, %%mm6 \n\t"
341  "paddw %%mm0, %%mm2 \n\t"
342  "paddw %%mm4, %%mm6 \n\t"
343  "movq %%mm2, (%2) \n\t"
344  "movq %%mm6, 8(%2) \n\t"
345  :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
346  : "memory"
347  );
348  }
 349  snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
 350  }
351 
352  {
 353  snow_interleave_line_header(&i, width, b, temp);
 354
355  for (; (i & 0x1E) != 0x1E; i-=2){
356  b[i+1] = temp[i>>1];
357  b[i] = b[i>>1];
358  }
359  for (i-=30; i>=0; i-=32){
360  __asm__ volatile(
361  "movq (%1), %%mm0 \n\t"
362  "movq 8(%1), %%mm2 \n\t"
363  "movq 16(%1), %%mm4 \n\t"
364  "movq 24(%1), %%mm6 \n\t"
365  "movq (%1), %%mm1 \n\t"
366  "movq 8(%1), %%mm3 \n\t"
367  "movq 16(%1), %%mm5 \n\t"
368  "movq 24(%1), %%mm7 \n\t"
369  "punpcklwd (%2), %%mm0 \n\t"
370  "punpcklwd 8(%2), %%mm2 \n\t"
371  "punpcklwd 16(%2), %%mm4 \n\t"
372  "punpcklwd 24(%2), %%mm6 \n\t"
373  "movq %%mm0, (%0) \n\t"
374  "movq %%mm2, 16(%0) \n\t"
375  "movq %%mm4, 32(%0) \n\t"
376  "movq %%mm6, 48(%0) \n\t"
377  "punpckhwd (%2), %%mm1 \n\t"
378  "punpckhwd 8(%2), %%mm3 \n\t"
379  "punpckhwd 16(%2), %%mm5 \n\t"
380  "punpckhwd 24(%2), %%mm7 \n\t"
381  "movq %%mm1, 8(%0) \n\t"
382  "movq %%mm3, 24(%0) \n\t"
383  "movq %%mm5, 40(%0) \n\t"
384  "movq %%mm7, 56(%0) \n\t"
385  :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
386  : "memory"
387  );
388  }
389  }
390 }
391 
392 #if HAVE_7REGS
393 #define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
394  ""op" ("r",%%"FF_REG_d"), %%"t0" \n\t"\
395  ""op" 16("r",%%"FF_REG_d"), %%"t1" \n\t"\
396  ""op" 32("r",%%"FF_REG_d"), %%"t2" \n\t"\
397  ""op" 48("r",%%"FF_REG_d"), %%"t3" \n\t"
398 
399 #define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
400  snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)
401 
402 #define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
403  snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3)
404 
405 #define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
406  "psubw %%"s0", %%"t0" \n\t"\
407  "psubw %%"s1", %%"t1" \n\t"\
408  "psubw %%"s2", %%"t2" \n\t"\
409  "psubw %%"s3", %%"t3" \n\t"
410 
411 #define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
412  "movdqa %%"s0", ("w",%%"FF_REG_d") \n\t"\
413  "movdqa %%"s1", 16("w",%%"FF_REG_d") \n\t"\
414  "movdqa %%"s2", 32("w",%%"FF_REG_d") \n\t"\
415  "movdqa %%"s3", 48("w",%%"FF_REG_d") \n\t"
416 
417 #define snow_vertical_compose_sra(n,t0,t1,t2,t3)\
418  "psraw $"n", %%"t0" \n\t"\
419  "psraw $"n", %%"t1" \n\t"\
420  "psraw $"n", %%"t2" \n\t"\
421  "psraw $"n", %%"t3" \n\t"
422 
423 #define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
424  "paddw %%"s0", %%"t0" \n\t"\
425  "paddw %%"s1", %%"t1" \n\t"\
426  "paddw %%"s2", %%"t2" \n\t"\
427  "paddw %%"s3", %%"t3" \n\t"
428 
429 #define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\
430  "pmulhw %%"s0", %%"t0" \n\t"\
431  "pmulhw %%"s1", %%"t1" \n\t"\
432  "pmulhw %%"s2", %%"t2" \n\t"\
433  "pmulhw %%"s3", %%"t3" \n\t"
434 
435 #define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
436  "movdqa %%"s0", %%"t0" \n\t"\
437  "movdqa %%"s1", %%"t1" \n\t"\
438  "movdqa %%"s2", %%"t2" \n\t"\
439  "movdqa %%"s3", %%"t3" \n\t"
440 
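/*
 * These helpers just stamp out the same instruction once per register.
 * For example (assuming FF_REG_d expands to "rdx" on x86_64),
 *
 *     snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
 *
 * expands to
 *
 *     "movdqa   (%4,%%rdx), %%xmm0 \n\t"
 *     "movdqa 16(%4,%%rdx), %%xmm2 \n\t"
 *     "movdqa 32(%4,%%rdx), %%xmm4 \n\t"
 *     "movdqa 48(%4,%%rdx), %%xmm6 \n\t"
 */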
441 static void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
442  x86_reg i = width;
443 
444  while(i & 0x1F)
445  {
446  i--;
447  b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
448  b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
449  b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
450  b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
451  }
452  i+=i;
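 // From here on, i is a byte offset rather than an element count (IDWTELEM is
 // a 16-bit short), so each 64-byte step of the asm loop below processes 32
 // coefficients from each of the six input lines.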
453 
454  __asm__ volatile (
455  "jmp 2f \n\t"
456  "1: \n\t"
457  snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
458  snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6")
459 
460 
461  "pcmpeqw %%xmm0, %%xmm0 \n\t"
462  "pcmpeqw %%xmm2, %%xmm2 \n\t"
463  "paddw %%xmm2, %%xmm2 \n\t"
464  "paddw %%xmm0, %%xmm2 \n\t"
465  "psllw $13, %%xmm2 \n\t"
466  snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7")
467  snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7")
468  snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7")
469  snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7")
470  snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
471  snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7")
472  snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
473  snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6")
474 
475  "pcmpeqw %%xmm7, %%xmm7 \n\t"
476  "pcmpeqw %%xmm5, %%xmm5 \n\t"
477  "psllw $15, %%xmm7 \n\t"
478  "psrlw $13, %%xmm5 \n\t"
479  "paddw %%xmm7, %%xmm5 \n\t"
480  snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6")
481  "movq (%2,%%"FF_REG_d"), %%xmm1 \n\t"
482  "movq 8(%2,%%"FF_REG_d"), %%xmm3 \n\t"
483  "paddw %%xmm7, %%xmm1 \n\t"
484  "paddw %%xmm7, %%xmm3 \n\t"
485  "pavgw %%xmm1, %%xmm0 \n\t"
486  "pavgw %%xmm3, %%xmm2 \n\t"
487  "movq 16(%2,%%"FF_REG_d"), %%xmm1 \n\t"
488  "movq 24(%2,%%"FF_REG_d"), %%xmm3 \n\t"
489  "paddw %%xmm7, %%xmm1 \n\t"
490  "paddw %%xmm7, %%xmm3 \n\t"
491  "pavgw %%xmm1, %%xmm4 \n\t"
492  "pavgw %%xmm3, %%xmm6 \n\t"
493  snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6")
494  snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
495  snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
496 
497  snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6")
498  snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
499  snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6")
500  snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6")
501  snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
502  snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
503  snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
504  snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6")
505  snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6")
506 
507  "2: \n\t"
508  "sub $64, %%"FF_REG_d" \n\t"
509  "jge 1b \n\t"
510  :"+d"(i)
511  :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
512 }
513 
514 #define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
515  ""op" ("r",%%"FF_REG_d"), %%"t0" \n\t"\
516  ""op" 8("r",%%"FF_REG_d"), %%"t1" \n\t"\
517  ""op" 16("r",%%"FF_REG_d"), %%"t2" \n\t"\
518  ""op" 24("r",%%"FF_REG_d"), %%"t3" \n\t"
519 
520 #define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
521  snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)
522 
523 #define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
524  snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3)
525 
526 #define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
527  "movq %%"s0", ("w",%%"FF_REG_d") \n\t"\
528  "movq %%"s1", 8("w",%%"FF_REG_d") \n\t"\
529  "movq %%"s2", 16("w",%%"FF_REG_d") \n\t"\
530  "movq %%"s3", 24("w",%%"FF_REG_d") \n\t"
531 
532 #define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
533  "movq %%"s0", %%"t0" \n\t"\
534  "movq %%"s1", %%"t1" \n\t"\
535  "movq %%"s2", %%"t2" \n\t"\
536  "movq %%"s3", %%"t3" \n\t"
537 
538 
539 static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
540  x86_reg i = width;
541  while(i & 15)
542  {
543  i--;
544  b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
545  b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
546  b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
547  b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
548  }
549  i+=i;
550  __asm__ volatile(
551  "jmp 2f \n\t"
552  "1: \n\t"
553 
554  snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7")
555  snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7")
556  "pcmpeqw %%mm0, %%mm0 \n\t"
557  "pcmpeqw %%mm2, %%mm2 \n\t"
558  "paddw %%mm2, %%mm2 \n\t"
559  "paddw %%mm0, %%mm2 \n\t"
560  "psllw $13, %%mm2 \n\t"
561  snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7")
562  snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7")
563  snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7")
564  snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7")
565  snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6")
566  snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7")
567  snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
568  snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6")
569  "pcmpeqw %%mm7, %%mm7 \n\t"
570  "pcmpeqw %%mm5, %%mm5 \n\t"
571  "psllw $15, %%mm7 \n\t"
572  "psrlw $13, %%mm5 \n\t"
573  "paddw %%mm7, %%mm5 \n\t"
574  snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
575  "movq (%2,%%"FF_REG_d"), %%mm1 \n\t"
576  "movq 8(%2,%%"FF_REG_d"), %%mm3 \n\t"
577  "paddw %%mm7, %%mm1 \n\t"
578  "paddw %%mm7, %%mm3 \n\t"
579  "pavgw %%mm1, %%mm0 \n\t"
580  "pavgw %%mm3, %%mm2 \n\t"
581  "movq 16(%2,%%"FF_REG_d"), %%mm1 \n\t"
582  "movq 24(%2,%%"FF_REG_d"), %%mm3 \n\t"
583  "paddw %%mm7, %%mm1 \n\t"
584  "paddw %%mm7, %%mm3 \n\t"
585  "pavgw %%mm1, %%mm4 \n\t"
586  "pavgw %%mm3, %%mm6 \n\t"
587  snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
588  snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
589  snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
590 
591  snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6")
592  snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
593  snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6")
594  snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6")
595  snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
596  snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
597  snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
598  snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6")
599  snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6")
600 
601  "2: \n\t"
602  "sub $32, %%"FF_REG_d" \n\t"
603  "jge 1b \n\t"
604  :"+d"(i)
605  :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
606 }
607 #endif //HAVE_7REGS
608 
609 #if HAVE_6REGS
610 #define snow_inner_add_yblock_sse2_header \
611  IDWTELEM * * dst_array = sb->line + src_y;\
612  x86_reg tmp;\
613  __asm__ volatile(\
614  "mov %7, %%"FF_REG_c" \n\t"\
615  "mov %6, %2 \n\t"\
616  "mov %4, %%"FF_REG_S" \n\t"\
617  "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\
618  "pcmpeqd %%xmm3, %%xmm3 \n\t"\
619  "psllw $15, %%xmm3 \n\t"\
620  "psrlw $12, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\
621  "1: \n\t"\
622  "mov %1, %%"FF_REG_D" \n\t"\
623  "mov (%%"FF_REG_D"), %%"FF_REG_D" \n\t"\
624  "add %3, %%"FF_REG_D" \n\t"
625 
626 #define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
627  "mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
628  "movq (%%"FF_REG_d"), %%"out_reg1" \n\t"\
629  "movq (%%"FF_REG_d", %%"FF_REG_c"), %%"out_reg2" \n\t"\
630  "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
631  "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
632  "movq "s_offset"(%%"FF_REG_S"), %%xmm0 \n\t"\
633  "movq "s_offset"+16(%%"FF_REG_S"), %%xmm4 \n\t"\
634  "punpcklbw %%xmm7, %%xmm0 \n\t"\
635  "punpcklbw %%xmm7, %%xmm4 \n\t"\
636  "pmullw %%xmm0, %%"out_reg1" \n\t"\
637  "pmullw %%xmm4, %%"out_reg2" \n\t"
638 
639 #define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
640  "mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
641  "movq (%%"FF_REG_d"), %%"out_reg1" \n\t"\
642  "movq 8(%%"FF_REG_d"), %%"out_reg2" \n\t"\
643  "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
644  "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
645  "movq "s_offset"(%%"FF_REG_S"), %%xmm0 \n\t"\
646  "movq "s_offset"+8(%%"FF_REG_S"), %%xmm4 \n\t"\
647  "punpcklbw %%xmm7, %%xmm0 \n\t"\
648  "punpcklbw %%xmm7, %%xmm4 \n\t"\
649  "pmullw %%xmm0, %%"out_reg1" \n\t"\
650  "pmullw %%xmm4, %%"out_reg2" \n\t"
651 
652 #define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
653  snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
654  "paddusw %%xmm2, %%xmm1 \n\t"\
655  "paddusw %%xmm6, %%xmm5 \n\t"
656 
657 #define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
658  snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
659  "paddusw %%xmm2, %%xmm1 \n\t"\
660  "paddusw %%xmm6, %%xmm5 \n\t"
661 
662 #define snow_inner_add_yblock_sse2_end_common1\
663  "add $32, %%"FF_REG_S" \n\t"\
664  "add %%"FF_REG_c", %0 \n\t"\
665  "add %%"FF_REG_c", "FF_PTR_SIZE"*3(%%"FF_REG_a"); \n\t"\
666  "add %%"FF_REG_c", "FF_PTR_SIZE"*2(%%"FF_REG_a"); \n\t"\
667  "add %%"FF_REG_c", "FF_PTR_SIZE"*1(%%"FF_REG_a"); \n\t"\
668  "add %%"FF_REG_c", (%%"FF_REG_a") \n\t"
669 
670 #define snow_inner_add_yblock_sse2_end_common2\
671  "jnz 1b \n\t"\
672  :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
673  :\
674  "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
675  XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", )\
676  "%"FF_REG_c"","%"FF_REG_S"","%"FF_REG_D"","%"FF_REG_d"");
677 
678 #define snow_inner_add_yblock_sse2_end_8\
679  "sal $1, %%"FF_REG_c" \n\t"\
680  "add"FF_OPSIZE" $"FF_PTR_SIZE"*2, %1 \n\t"\
681  snow_inner_add_yblock_sse2_end_common1\
682  "sar $1, %%"FF_REG_c" \n\t"\
683  "sub $2, %2 \n\t"\
684  snow_inner_add_yblock_sse2_end_common2
685 
686 #define snow_inner_add_yblock_sse2_end_16\
687  "add"FF_OPSIZE" $"FF_PTR_SIZE"*1, %1 \n\t"\
688  snow_inner_add_yblock_sse2_end_common1\
689  "dec %2 \n\t"\
690  snow_inner_add_yblock_sse2_end_common2
691 
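/*
 * The routines below are SIMD versions of ff_snow_inner_add_yblock(): each
 * output pixel combines the four overlapping motion-compensated predictions,
 * weighted by the OBMC window, with the IDWT output from the slice buffer,
 * then rounds, clips and stores the result to dst8. Roughly, per pixel
 * (names are only illustrative; see snow.c for the exact scaling):
 *
 *     v = obmc1[x]*block[3][x] + obmc2[x]*block[2][x]
 *       + obmc3[x]*block[1][x] + obmc4[x]*block[0][x];
 *     dst8[x] = av_clip_uint8((scale(v) + dst[x] + round) >> FRAC_BITS);
 */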
692 static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
693  int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
694 snow_inner_add_yblock_sse2_header
695 snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
696 snow_inner_add_yblock_sse2_accum_8("2", "8")
697 snow_inner_add_yblock_sse2_accum_8("1", "128")
698 snow_inner_add_yblock_sse2_accum_8("0", "136")
699 
700  "mov %0, %%"FF_REG_d" \n\t"
701  "movdqa (%%"FF_REG_D"), %%xmm0 \n\t"
702  "movdqa %%xmm1, %%xmm2 \n\t"
703 
704  "punpckhwd %%xmm7, %%xmm1 \n\t"
705  "punpcklwd %%xmm7, %%xmm2 \n\t"
706  "paddd %%xmm2, %%xmm0 \n\t"
707  "movdqa 16(%%"FF_REG_D"), %%xmm2\n\t"
708  "paddd %%xmm1, %%xmm2 \n\t"
709  "paddd %%xmm3, %%xmm0 \n\t"
710  "paddd %%xmm3, %%xmm2 \n\t"
711 
712  "mov %1, %%"FF_REG_D" \n\t"
713  "mov "FF_PTR_SIZE"(%%"FF_REG_D"), %%"FF_REG_D"; \n\t"
714  "add %3, %%"FF_REG_D" \n\t"
715 
716  "movdqa (%%"FF_REG_D"), %%xmm4 \n\t"
717  "movdqa %%xmm5, %%xmm6 \n\t"
718  "punpckhwd %%xmm7, %%xmm5 \n\t"
719  "punpcklwd %%xmm7, %%xmm6 \n\t"
720  "paddd %%xmm6, %%xmm4 \n\t"
721  "movdqa 16(%%"FF_REG_D"), %%xmm6\n\t"
722  "paddd %%xmm5, %%xmm6 \n\t"
723  "paddd %%xmm3, %%xmm4 \n\t"
724  "paddd %%xmm3, %%xmm6 \n\t"
725 
726  "psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */
727  "psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */
728  "packssdw %%xmm2, %%xmm0 \n\t"
729  "packuswb %%xmm7, %%xmm0 \n\t"
730  "movq %%xmm0, (%%"FF_REG_d") \n\t"
731 
732  "psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */
733  "psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */
734  "packssdw %%xmm6, %%xmm4 \n\t"
735  "packuswb %%xmm7, %%xmm4 \n\t"
736  "movq %%xmm4, (%%"FF_REG_d",%%"FF_REG_c"); \n\t"
737 snow_inner_add_yblock_sse2_end_8
738 }
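// This variant consumes two output rows per loop iteration (the _end_8 macro
// temporarily doubles the stride register and decrements the row counter by
// 2), which is why the dispatcher further down only selects it when b_h is
// even.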
739 
740 static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
741  int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
742 snow_inner_add_yblock_sse2_header
743 snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
744 snow_inner_add_yblock_sse2_accum_16("2", "16")
745 snow_inner_add_yblock_sse2_accum_16("1", "512")
746 snow_inner_add_yblock_sse2_accum_16("0", "528")
747 
748  "mov %0, %%"FF_REG_d" \n\t"
749  "psrlw $4, %%xmm1 \n\t"
750  "psrlw $4, %%xmm5 \n\t"
751  "paddw (%%"FF_REG_D"), %%xmm1 \n\t"
752  "paddw 16(%%"FF_REG_D"), %%xmm5 \n\t"
753  "paddw %%xmm3, %%xmm1 \n\t"
754  "paddw %%xmm3, %%xmm5 \n\t"
755  "psraw $4, %%xmm1 \n\t" /* FRAC_BITS. */
756  "psraw $4, %%xmm5 \n\t" /* FRAC_BITS. */
757  "packuswb %%xmm5, %%xmm1 \n\t"
758 
759  "movdqu %%xmm1, (%%"FF_REG_d") \n\t"
760 
761 snow_inner_add_yblock_sse2_end_16
762 }
763 
764 #define snow_inner_add_yblock_mmx_header \
765  IDWTELEM * * dst_array = sb->line + src_y;\
766  x86_reg tmp;\
767  __asm__ volatile(\
768  "mov %7, %%"FF_REG_c" \n\t"\
769  "mov %6, %2 \n\t"\
770  "mov %4, %%"FF_REG_S" \n\t"\
771  "pxor %%mm7, %%mm7 \n\t" /* 0 */\
772  "pcmpeqd %%mm3, %%mm3 \n\t"\
773  "psllw $15, %%mm3 \n\t"\
774  "psrlw $12, %%mm3 \n\t" /* FRAC_BITS >> 1 */\
775  "1: \n\t"\
776  "mov %1, %%"FF_REG_D" \n\t"\
777  "mov (%%"FF_REG_D"), %%"FF_REG_D" \n\t"\
778  "add %3, %%"FF_REG_D" \n\t"
779 
780 #define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
781  "mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
782  "movd "d_offset"(%%"FF_REG_d"), %%"out_reg1" \n\t"\
783  "movd "d_offset"+4(%%"FF_REG_d"), %%"out_reg2" \n\t"\
784  "punpcklbw %%mm7, %%"out_reg1" \n\t"\
785  "punpcklbw %%mm7, %%"out_reg2" \n\t"\
786  "movd "s_offset"(%%"FF_REG_S"), %%mm0 \n\t"\
787  "movd "s_offset"+4(%%"FF_REG_S"), %%mm4 \n\t"\
788  "punpcklbw %%mm7, %%mm0 \n\t"\
789  "punpcklbw %%mm7, %%mm4 \n\t"\
790  "pmullw %%mm0, %%"out_reg1" \n\t"\
791  "pmullw %%mm4, %%"out_reg2" \n\t"
792 
793 #define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
794  snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\
795  "paddusw %%mm2, %%mm1 \n\t"\
796  "paddusw %%mm6, %%mm5 \n\t"
797 
798 #define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
799  "mov %0, %%"FF_REG_d" \n\t"\
800  "psrlw $4, %%mm1 \n\t"\
801  "psrlw $4, %%mm5 \n\t"\
802  "paddw "read_offset"(%%"FF_REG_D"), %%mm1 \n\t"\
803  "paddw "read_offset"+8(%%"FF_REG_D"), %%mm5 \n\t"\
804  "paddw %%mm3, %%mm1 \n\t"\
805  "paddw %%mm3, %%mm5 \n\t"\
806  "psraw $4, %%mm1 \n\t"\
807  "psraw $4, %%mm5 \n\t"\
808  "packuswb %%mm5, %%mm1 \n\t"\
809  "movq %%mm1, "write_offset"(%%"FF_REG_d") \n\t"
810 
811 #define snow_inner_add_yblock_mmx_end(s_step)\
812  "add $"s_step", %%"FF_REG_S" \n\t"\
813  "add %%"FF_REG_c", "FF_PTR_SIZE"*3(%%"FF_REG_a"); \n\t"\
814  "add %%"FF_REG_c", "FF_PTR_SIZE"*2(%%"FF_REG_a"); \n\t"\
815  "add %%"FF_REG_c", "FF_PTR_SIZE"*1(%%"FF_REG_a"); \n\t"\
816  "add %%"FF_REG_c", (%%"FF_REG_a") \n\t"\
817  "add"FF_OPSIZE " $"FF_PTR_SIZE"*1, %1 \n\t"\
818  "add %%"FF_REG_c", %0 \n\t"\
819  "dec %2 \n\t"\
820  "jnz 1b \n\t"\
821  :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
822  :\
823  "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
824  "%"FF_REG_c"","%"FF_REG_S"","%"FF_REG_D"","%"FF_REG_d"");
825 
826 static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
827  int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
828 snow_inner_add_yblock_mmx_header
829 snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
830 snow_inner_add_yblock_mmx_accum("2", "8", "0")
831 snow_inner_add_yblock_mmx_accum("1", "128", "0")
832 snow_inner_add_yblock_mmx_accum("0", "136", "0")
833 snow_inner_add_yblock_mmx_mix("0", "0")
834 snow_inner_add_yblock_mmx_end("16")
835 }
836 
837 static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
838  int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
839 snow_inner_add_yblock_mmx_header
840 snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
841 snow_inner_add_yblock_mmx_accum("2", "16", "0")
842 snow_inner_add_yblock_mmx_accum("1", "512", "0")
843 snow_inner_add_yblock_mmx_accum("0", "528", "0")
844 snow_inner_add_yblock_mmx_mix("0", "0")
845 
846 snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8")
847 snow_inner_add_yblock_mmx_accum("2", "24", "8")
848 snow_inner_add_yblock_mmx_accum("1", "520", "8")
849 snow_inner_add_yblock_mmx_accum("0", "536", "8")
850 snow_inner_add_yblock_mmx_mix("16", "8")
851 snow_inner_add_yblock_mmx_end("32")
852 }
853 
854 static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
855  int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
856 
857  if (b_w == 16)
858  inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
859  else if (b_w == 8 && obmc_stride == 16) {
860  if (!(b_h & 1))
861  inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
862  else
863  inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
864  } else
865  ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
866 }
867 
868 static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
869  int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
870  if (b_w == 16)
871  inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
872  else if (b_w == 8 && obmc_stride == 16)
873  inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
874  else
875  ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
876 }
877 #endif /* HAVE_6REGS */
878 
879 #endif /* HAVE_INLINE_ASM */
880 
 881 av_cold void ff_dwt_init_x86(SnowDWTContext *c)
 882 {
883 #if HAVE_INLINE_ASM
884  int mm_flags = av_get_cpu_flags();
885 
886  if (mm_flags & AV_CPU_FLAG_MMX) {
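 // Note: the "& 0" in the next condition makes it always false, so the SSE2
 // compose paths below are currently disabled and only the MMX/MMXEXT
 // functions are actually installed.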
887  if(mm_flags & AV_CPU_FLAG_SSE2 & 0){
888  c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
889 #if HAVE_7REGS
890  c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
891 #endif
892 #if HAVE_6REGS
893  c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
894 #endif
895  }
896  else{
897  if (mm_flags & AV_CPU_FLAG_MMXEXT) {
898  c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
899 #if HAVE_7REGS
900  c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
901 #endif
902  }
903 #if HAVE_6REGS
904  c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
905 #endif
906  }
907  }
908 #endif /* HAVE_INLINE_ASM */
909 }